;***************************************
;* procs_b.inc file. Some procedures  **
;***************************************
;; procs_b.inc procedures list:
;;  aabb_inter
;;  calc_combo
;;  count_transparent_voxels
;;  do_variable_slices_ptr_buff
;;  intersect_tri
;;  make_bumps
;;  make_series
;;  mark_colided_edges
;;  mark_colided_edges_th
;;  morph_anim
;;  rem_tri
;;  remove_non_tri
;;  to_pieces
;;  to_pieces_anim
;include "labs.inc"
;===================================================
if 0
flood_sketch:
; NOTE: this whole procedure is assembled out by the surrounding `if 0`.
;
; Walks the sketch bit mask ([sketch_buff_ptr], one bit per Z-buffer dword)
; together with the Z buffer ([Zbuffer_ptr]), first forward (cld) and then
; backward (std).  Once a set bit is met, following bits of the same mask
; dword are set as long as the matching Z-buffer entry differs from 60000.1;
; an entry equal to 60000.1 clears the "inside" flag kept in bit 0 of ebx.
; NOTE(review): 60000.1 is presumably the "empty" Z-buffer value - confirm.
; NOTE(review): the backward pass executes `std` and nothing in this block
; restores the direction flag before `ret`.

       .skb_border  equ dword [ebp-4]
       .mrk_chu_ptr equ dword [ebp-8]
       .chu_mrk_cnt equ word  [ebp-10]
       .marker      equ byte  [ebp-11]
       push      ebp
       mov       ebp,esp
       sub       esp,15
       mov       edi,[Zbuffer_ptr]
       mov       esi,[sketch_buff_ptr]
       mov       eax,esi
       add       eax,8
       ; lower bound used by the backward pass
       mov       .skb_border,eax
       or        esi,esi
       ; no sketch buffer -> nothing to do
       jz        .end22

       ; ecx = (xres * yres) / 32 + 1 = number of mask dwords to visit
       movzx     ecx,[xres_var]
       movzx     ebx,[yres_var]
       imul      ecx,ebx
       shr       ecx,3 + 2
       inc       ecx
       ; keep the count for the backward pass
       push      ecx
       xor       ebx,ebx
       cld
     .loop_scat:
       ; eax = next 32 mask bits, ecx = bit index inside eax
       lodsd
       push      ecx
       xor       ecx,ecx
     .llsc:
       ; ebx bit 0 = "inside" flag
       bt        ebx,0
       jc        .Z
       bt        eax,ecx
       jc        .Z
       add       edi,4
       inc       ecx
       cmp       ecx,32
       jne       .llsc
     .nx_in_sc:
       ; write the (possibly extended) mask dword back
       mov       [esi-4],eax
       pop       ecx
       loop      .loop_scat
       jmp       .end11
     .Zclear:
       xor       ebx,ebx
     .Z:
       add       edi,4
       inc       ecx
       cmp       ecx,32
       je        .nx_in_sc
       cmp       [edi],dword 60000.1
       je        .Zclear
       bts       eax,ecx
       bts       ebx,0
       jmp       .Z

     .end11:
       ; second pass: same walk, but backward through both buffers
       pop       ecx
       std
       xor       ebx,ebx
     .loop_scat2:
       lodsd
       push      ecx
       mov       ecx,32
     .llsc2:
       bt        ebx,0
       jc        .Z2
       bt        eax,ecx
       jc        .Z2
       sub       edi,4
    ;   dec       ecx
    ;   cmp       ecx,0
    ;   jnl       .llsc2
       loop      .llsc2
     .nx_in_sc2:
       ; stop once the read pointer reaches the start of the mask
       cmp       esi,.skb_border
       jbe       .end22
       mov       [esi+4],eax
       pop       ecx
       loop      .loop_scat2
       jz       .end22
     .Zclear2:
       xor       ebx,ebx
     .Z2:
       sub       edi,4
       dec       ecx
       jz        .nx_in_sc2
       cmp       [edi],dword 60000.1
       je        .Zclear2
       bts       eax,ecx
       bts       ebx,0
       jmp       .Z2
    .end22:
       ; restoring esp from ebp also discards any counter still pushed
       mov      esp,ebp
       pop      ebp
ret
end if
sketch_3d:
; Turn the 2D sketch bit mask into geometry: for every set bit of the mask
; at [sketch_buff_ptr] a small tetrahedron (tetra_points_r / tetra_triangles)
; is appended to a copy of the current object.
; in:
;   globals [sketch_buff_ptr], [xres_var], [yres_var], [scale], [matrix],
;   [points_r_ptr], [points_count_var], [triangles_ptr], [triangles_count_var]
; out:
;   [points_r_ptr], [triangles_ptr], [points_count_var] and
;   [triangles_count_var] describe the enlarged object; the old point and
;   triangle buffers are released with mfree.  The visited mask dwords are
;   cleared.  Nothing happens when [sketch_buff_ptr] is 0.
; Registers eax, ebx, ecx, edx, esi, edi and xmm0-xmm3 are not preserved.
       .xy_cnt       equ [ebp-4]        ; running bit (pixel) counter
       .tcnt         equ dword[ebp-8]   ; triangles so far
       .pcnt         equ dword[ebp-12]  ; points so far
       .one          equ [ebp-16]       ; constant 1 for paddd
       .pts_ptr      equ [ebp-20]       ; new points buffer
       .curr_p_ptr   equ [ebp-24]       ; write position in the points buffer
       .tris_ptr     equ [ebp-28]       ; new triangles buffer
       .curr_t_ptr   equ [ebp-32]       ; write position in the triangles buffer
       .mx           equ [ebp-72]

       push      ebp
       mov       ebp,esp
       xor       eax,eax
       mov       ebx,[triangles_count_var]
       mov       edx,[points_count_var]
       ; the three pushes initialise .xy_cnt = 0, .tcnt and .pcnt
       push      eax ebx edx
       or        eax,1
       ; .one = 1
       push      eax
       sub       esp,80
   ;   mov       esi,matrix ;_scaled
   ;   lea       edi,.mx
   ;   call      reverse_mx_3x3

       mov       esi,[sketch_buff_ptr]
       or        esi,esi
       jz        .end
       cld
       ; ecx = (xres * yres) / 32 = number of mask dwords
       movzx     ecx,[xres_var]
       movzx     ebx,[yres_var]
       imul      ecx,ebx
       shr       ecx,3 + 2
       ; kept on the stack for the insertion pass below
       push      ecx
       xor       ebx,ebx
     @@:
       lodsd
       popcnt    edx,eax
       add       ebx,edx
       loop      @b
       ; ebx = number of set bits = number of tetrahedrons to insert

       ; points buffer: old points + (bits + 100) * 96 bytes
       mov       eax,.pcnt ;[points_count_var]
       imul      eax,12
       add       ebx,100
       imul      ebx,12*8
       add       eax,ebx
       malloc    eax
       mov       .pts_ptr,eax
       mov       .curr_p_ptr,eax

       ; triangles buffer: old triangles + the same extra amount
       ; (the code relies on malloc leaving ebx intact)
       mov       eax,.tcnt ;[triangles_count_var]
       imul      eax,12
       add       eax,ebx
       malloc    eax
       mov       .tris_ptr,eax
       mov       .curr_t_ptr,eax
       xor       ecx,ecx
       cmp       .pcnt,ecx
       ; empty object: nothing to copy
       jz        @f
       mov       esi,[triangles_ptr]
       mov       edi,eax
       mov       ecx,.tcnt ;[triangles_count_var]
       lea       ecx,[ecx*3]
       cld
       rep       movsd
       mov       .curr_t_ptr,edi

       mov       esi,[points_r_ptr]
       mov       edi,.pts_ptr
       mov       ecx,.pcnt ;[points_count_var]
       lea       ecx,[ecx*3]
       cld
       rep       movsd
       mov       .curr_p_ptr,edi
     @@:

       mov       esi,[sketch_buff_ptr]
       cld
       ; ecx = number of mask dwords
       pop       ecx
     .loop_scat:
    ;  mov       eax,[esi]
       ; eax = 32 mask bits, must stay intact for the whole inner loop
       lodsd
       push      ecx
       xor       ecx,ecx
     .llsc:
       ; .xy_cnt += 1 (only the low dword is written back)
       movlps    xmm0,.xy_cnt
       movlps    xmm1,.one
       paddd     xmm0,xmm1
       movss     .xy_cnt,xmm0
       xor       ebx,ebx
       bt        eax,ecx
       jc        .insert_object
     .nx_in_sc:
       inc       ecx
       cmp       ecx,32
       jne       .llsc
       ; all 32 bits handled: clear this mask dword
       xor       ecx,ecx
       mov       [esi-4],ecx
   ;   add       esi,4
       pop       ecx
       loop      .loop_scat
       jmp       .mjob

    .insert_object:
       ; eax is saved too: it holds the mask dword still being scanned and
       ; is overwritten several times below.
       push      eax esi edi ecx
       mov       esi,tetra_points_r
       mov       edi,.curr_p_ptr
       cld
       ; split the running counter into screen coordinates
       mov       eax,.xy_cnt
       cdq
       movzx     ebx,[xres_var]
       div       ebx
       ; eax = y
       ; edx = x
     ;  shr       eax,1
     ;  shr       edx,1
       ; [esp] = x, [esp+4] = y  ->  xmm0 = (x, y, 0, 0)
       push      eax edx
       xorps     xmm0,xmm0
       cvtpi2ps  xmm0,[esp]
       add       esp,8

       ; xmm1 = 10 / scale (tetrahedron size), xmm0 = position / scale
       ; (rcpss gives an approximate reciprocal)
       mov       eax,10
       cvtsi2ss  xmm1,eax
       mov       ecx,4      ; 4 verts
       shufps    xmm1,xmm1,0
       rcpss     xmm3,[scale]
       shufps    xmm3,xmm3,0
       mulps     xmm0,xmm3
       mulps     xmm1,xmm3
     @@:
       ; 16-byte store on a 12-byte stride: the 4th lane is overwritten by
       ; the next vertex (the last one lands in the buffer slack)
       movups    xmm2,[esi]
       mulps     xmm2,xmm1
       addps     xmm2,xmm0
       movups    [edi],xmm2
       add       edi,12
       add       esi,12
       loop      @b

       ; rotate the 4 new points in place with [matrix]
       mov       edi,.curr_p_ptr
       mov       esi,edi
       mov       ecx,4
       lea       ebx,[matrix] ;.mx
       call      rotary
       ; NOTE(review): relies on rotary (defined elsewhere) returning edi
       ; just past the last written point - confirm.
       mov       .curr_p_ptr,edi

;       xorps     xmm0,xmm0
;       cvtpi2ps  xmm0,[esp]
;       add       esp,8
;       sub       edi,48
;       mov       ecx,4      ; 4 verts
;     @@:
;       movups    xmm2,[edi]
;       addps     xmm2,xmm0
;       movlps    [edi],xmm2
;       movhlps   xmm2,xmm2
;       movss     [edi+8],xmm2
;       add       edi,12
;       loop      @b
       ; append 4 triangles: 12 byte-sized indices rebased by .pcnt
       mov       esi,tetra_triangles
       mov       edi,.curr_t_ptr
       mov       ecx,12
     @@:
       lodsb
       movzx     eax,al
       add       eax,.pcnt
       stosd
       loop      @b
       mov       .curr_t_ptr,edi
       add       .tcnt,4
       add       .pcnt,4
       pop       ecx edi esi eax
       jmp       .nx_in_sc
     .mjob:
       ; swap the enlarged buffers in and release the old ones
       mfree     [points_r_ptr]
       mfree     [triangles_ptr]
       mov       eax,.pts_ptr
       mov       ebx,.tris_ptr
       mov       ecx,.tcnt
       mov       edx,.pcnt
       mov       [points_r_ptr],eax
       mov       [triangles_ptr],ebx
       mov       [triangles_count_var],ecx
       mov       [points_count_var],edx
     .end:
       mov       esp,ebp
       pop       ebp
ret
if 0
find_v_sketch:
; NOTE: this whole procedure is assembled out by the surrounding `if 0`.
;
; find sketched vertices
; Pass 1 (.marker = 0): for every set bit of the sketch mask, search the
;   triangles whose bounding box contains the pixel (x, y, Z-buffer value)
;   and remember the chunk number of each such triangle (list .mrk_chu_ptr).
; Pass 2 (.marker = 1): sum the vertex counts of the remembered chunks and
;   allocate [sketch_verts_ptr] / [sketch_verts_indices_ptr].
; Pass 3 (.marker = 2): copy those vertices and their indices.
       .tri_counter equ dword [ebp-4]
       .mrk_chu_ptr equ dword [ebp-8]
       .chu_mrk_cnt equ word  [ebp-10]
       .marker      equ byte  [ebp-11]
       .xy_cnt      equ       [ebp-15]
       .one         equ       [ebp-19]

       push      ebp
       mov       ebp,esp
       sub       esp,19
       xor       eax,eax
       mov       .tri_counter,eax
       mov       .chu_mrk_cnt,ax
       mov       .marker,al
       mov       .xy_cnt,eax
       mov       .one,dword 1
       mov       eax,4000+16
       malloc    eax
       mov       .mrk_chu_ptr,eax
       ; max 1000 marked chunks
       cld
       ; draw sketch buff
       mov       edi,[Zbuffer_ptr]
       mov       esi,[sketch_buff_ptr]
       or        esi,esi
       jz        .end

       ; ecx = (xres * yres) / 32 + 1 = number of mask dwords
       movzx     ecx,[xres_var]
       movzx     ebx,[yres_var]
       imul      ecx,ebx
       shr       ecx,3 + 2
       inc       ecx
     .loop_scat:
       lodsd
       push      ecx
       xor       ecx,ecx
     .llsc:
       ; .xy_cnt += 1 (pixel counter)
       movlps    xmm0,.xy_cnt
       movlps    xmm1,.one
       paddd     xmm0,xmm1
       movss     .xy_cnt,xmm0
       xor       ebx,ebx
       bt        eax,ecx
       jc        .checkZ
     .nx_in_sc:
       add       edi,4
       inc       ecx
       cmp       ecx,32
       jne       .llsc
       pop       ecx
       loop      .loop_scat
       inc       .marker
       jmp       .no_ch
  .checkZ:
       ; check z coefficient
       ; NOTE(review): eax (the mask dword) is overwritten below and is not
       ; restored before the jump back to .nx_in_sc.
       push      esi
       push      ecx
       push      edi
       mov       eax,.xy_cnt
       cdq
       movzx     ebx,[xres_var]
       div       ebx
       ; eax = y
       ; edx = x
       push      eax edx
       cvtpi2ps  xmm0,[esp]
       add       esp,8

       ; xmm0 = (x, y, z from the Z buffer, next Z-buffer entry)
       movhps    xmm0,[edi]
       mov       esi,[triangles_ptr]
       xor       ecx,ecx
     .ll:
       ; load the three vertex indices of triangle ecx
       lodsd
       xchg      eax,ebx
       lodsd
       xchg      eax,edx
       lodsd
       push      esi
       imul      eax,12
       imul      ebx,12
       mov       esi,[points_rotated_ptr]
       imul      edx,12
       movups    xmm7,[esi+eax]
       movups    xmm6,[esi+ebx]
       movups    xmm5,[esi+edx]
       ; xmm7 = per-axis max, xmm4 = per-axis min of the triangle
       movaps    xmm4,xmm7
       maxps     xmm7,xmm6
       maxps     xmm7,xmm5
       minps     xmm4,xmm6
       minps     xmm4,xmm5
       ; check if x, y and z are all inside the min-max box
       cmpltps   xmm7,xmm0
       cmpltps   xmm4,xmm0
       xorps     xmm7,xmm4
       movmskps  eax,xmm7
       and       eax,111b
       cmp       eax,111b
       je       .found_t
     .nx_t:
       pop       esi
       inc       ecx
       cmp       ecx,[triangles_count_var]
       jne       .ll
     .t_end:
       pop       edi
       pop       ecx
       pop       esi
       jmp       .nx_in_sc

    .found_t:
       ; marker = 0 => mark chunks

       ; eax = chunk number of triangle ecx (word table at [chunks_ptr])
       mov       eax,ecx
       add       eax,eax
       add       eax,[chunks_ptr]
       movzx     eax,word[eax]
       mov       edx,.mrk_chu_ptr
       xor       ebx,ebx
       xor       edi,edi
     @@:
       ; linear search of the dword list of marked chunks
       inc       ebx
       cmp       [edx],eax
       cmove     edi,ebx
       add       edx,4
       cmp       bx,.chu_mrk_cnt
       jb        @b
       pop       esi
       or        edi,edi
       ; is the chunk already in the list?
       jnz       .t_end
       movzx     edx,.chu_mrk_cnt
       shl       edx,2
       add       edx,.mrk_chu_ptr
       mov       [edx],eax
       cmp       .chu_mrk_cnt,990
       ; is the chunk count too big?
       ja        .t_end
       inc       .chu_mrk_cnt
       jmp       .t_end
     .no_ch:
       ; Transfer all vertices from
       ; selected chunks into separate list.
       ; Take sketch vectors into account.
       ; .marker = 1
     .tr:
       mov       esi,.mrk_chu_ptr
       xor       edx,edx
       movzx     ecx,.chu_mrk_cnt
       or        ecx,ecx
       jz        .end
       cld
       cvtdq2ps  xmm0,[xxadd]
    .nx_ch:
       ; NOTE(review): the list is written as dwords above (shl edx,2) but
       ; read here with lodsw, i.e. in 2-byte steps.
       lodsw
       movzx     eax,ax
       shl       eax,4
       add       eax,[chunks_desc_ptr]
       ; + 0 = count of tris in chunk
       ; + 4 = count of vertices in chunk
       ; + 8 = vertices offset
       ; + 12 = triangles_offset
       cmp       .marker,2
       ; .marker = 2 => transfer verts
       jne       .count
       push      ecx esi
       mov       esi,[eax+8]
       mov       ecx,[eax+4]
  ;     add       ecx,2
       mov       edx,esi
       or        esi,esi
       jz        .skip
;       inc       esi
;       dec       esi
       imul      esi,12
       add       esi,[points_rotated_ptr]
   ;    movss     xmm2,[scale]
   ;    shufps    xmm2,xmm2,0
     @@:
       movups    xmm1,[esi]
  ;     mulps     xmm1,xmm2
       subps     xmm1,xmm0
       ; translate backward
       movups    [edi],xmm1
       add       edi,12
       add       esi,12
     ;  movsd
     ;  movsd
     ;  movsd
       mov       [ebx],edx
       ; vert index
       add       ebx,4
       inc       edx
       loop      @b
     .skip:
       pop       esi ecx
     .count:
       add       edx,[eax+4]
       ; not important if .marker = 2, == transfer
       loop     .nx_ch
       cmp      .marker,2
       je       .end
       ; edx = all sketch verts count

       mov      [sketch_verts_No],edx
       add      edx,1000
       push     edx
       imul     edx,12
       malloc   edx
       ; NOTE(review): edx is stored here, while the second allocation
       ; below takes its pointer from eax - confirm which one malloc sets.
       mov      [sketch_verts_ptr],edx
       pop      edx
       shl      edx,2
       malloc   edx
       mov      [sketch_verts_indices_ptr],eax
       mov      .marker,2
       mov      edi,[sketch_verts_ptr]
       mov      ebx,eax
       jmp      .tr
     .end:
       ; verts marked as sketched
       ; (so verts from sketched chunks)
       ; are in [sketch_verts_ptr] list
       ; verts indices in [sketch_verts_indices] list
       ; number of verts = [sketch_verts_No]
       mfree   .mrk_chu_ptr
       mov      esp,ebp
       pop      ebp
ret
end if
;=====================================================
rem_tris_not_inner_vert:
; Remove all tris from selected chunk with NO inner
; verts. Other words delete front tris from particular
; chunk.
; in:
;   ecx = ch no
; out:
;   matching triangles have their three indices set to 0 in the buffer at
;   [triangles_ptr]; nothing is compacted here.
; NOTE(review): the code below clears a triangle only when the bit of each
; of its three vertices is SET in the bit array at [inner_vert_ptr]; what a
; set bit means is not visible in this file - confirm against the wording
; above.
       push   ebp
       mov    ebp,esp
       sub    esp,16
       .triangles_ptr equ [ebp-4]
       .pts_r         equ [ebp-8]
       .tcv           equ [ebp-12]
       .inv_ptr       equ [ebp-16]

       ; copy three consecutive dwords starting at triangles_count_var into
       ; .tcv, .pts_r and .triangles_ptr (depends on the globals' layout)
       cld
       lea    esi,[triangles_count_var]
       lea    edi,.tcv
       movsd
       movsd
       movsd
       mov    eax,[inner_vert_ptr]
       mov    .inv_ptr,eax
       ; dx = chunk number to process, ecx = triangle index
       mov    edx,ecx
       cld
       xor    ecx,ecx
       mov    esi,[chunks_ptr]
    @@:
       ; [chunks_ptr] holds one word per triangle: its chunk number
       lodsw
       cmp    ax,dx
       je     .chck_tri
    .nxC:
       inc    ecx
       cmp    ecx,.tcv
       jnz    @b
       jmp    .en1
     .chck_tri:
       mov    edi,ecx
       ; check triangle
       imul   edi,12
       add    edi,.triangles_ptr
       ; triangle index is kept on the stack while ecx counts vertices
       push   ecx
     .ll:
       mov    ecx,3
       ; xmm1 = the three vertex indices (plus one dword beyond)
       movups xmm1,[edi]
     .b:
       ; test bit number <vertex index> of the bit array:
       ; byte = index / 8, bit = index mod 8
       movd   eax,xmm1
       mov    ebx,eax
       shr    eax,3
       and    ebx,111b
       add    eax,.inv_ptr
       bt     [eax],ebx
       ; bit clear -> leave this triangle alone
       jnc    .f
       ; next vertex index into the low dword
       psrldq xmm1,4
       loop   .b
       xor    eax,eax
       ; mark tri as outside
       stosd
       stosd
       stosd
     .f:
       pop    ecx
       jmp    .nxC
     .en1:
       mov     esp,ebp
       pop     ebp
ret
;=======================================================
rem_TIV_all:
; remove triangles, all of its verts
; are marked as inside
; out:
;   matching triangles have their three indices set to 0 in the buffer at
;   [triangles_ptr].
; NOTE(review): the code below clears a triangle only when the bit of each
; of its three vertices is CLEAR in the bit array at [inner_vert_ptr]; what
; a set bit means is not visible in this file - confirm against the wording
; above.
       push   ebp
       mov    ebp,esp
       sub    esp,12

       .triangles_ptr equ [ebp-4]
       .pts_r         equ [ebp-8]
       .tcv           equ [ebp-12]
       ; copy three consecutive dwords starting at triangles_count_var into
       ; .tcv, .pts_r and .triangles_ptr (depends on the globals' layout)
       cld
       lea    esi,[triangles_count_var]
       lea    edi,.tcv
       movsd
       movsd
       movsd
       mov    edx,[inner_vert_ptr]
       mov    esi,.triangles_ptr
       xor    ecx,ecx
     .ll:
       ; xmm0 keeps the triangle index while ecx counts vertices
       movd   xmm0,ecx
       mov    ecx,3
       ; xmm1 = the three vertex indices (plus one dword beyond)
       movups xmm1,[esi]
     .b:
       ; test bit number <vertex index>: byte = index / 8, bit = index mod 8
       movd   eax,xmm1
       mov    ebx,eax
       shr    eax,3
       and    ebx,111b
       bt     [eax+edx],ebx
       ; bit set -> leave this triangle alone
       jc     .f
       psrldq xmm1,4
       loop   .b
       ; all three bits were clear: zero the triangle's indices
       movd   edi,xmm0
       imul   edi,12
       add    edi,.triangles_ptr
       xor    eax,eax
       stosd
       stosd
       stosd
     .f:
       movd   ecx,xmm0
       add    esi,12
       inc    ecx
       cmp    ecx,.tcv
       jnz    .ll
     .en1:
       mov     esp,ebp
       pop     ebp
ret
;===============================================================
draw_wave:
; Plot a buffer of signed 16-bit samples as a wave: sample number i goes to
; column i, its row is derived from the sample value.  Every sample is drawn
; as 4 consecutive dwords of 0xFFFFFFFF.
; in:
;   edx = ptr to data
;   eax = ptr to data end
;   xmm0 = lo -> hi, dwords int: y res, x res, screen ptr
; out:
;   nothing; eax, ebx, ecx, esi and xmm0-xmm7 are not preserved.
; Drawing stops at the end of the data or after x res samples, whichever
; comes first.  Rows are clamped to 0 .. y res - 1.
; NOTE(review): the 16-byte store may still spill up to 3 pixels past the
; end of a row for the last columns.
     push       ebp
     mov        ebp,esp
     .ptr       equ  dword [ebp-4]
     .ptr_end   equ  dword [ebp-8]
     .big       equ        [ebp-12]
     .small     equ        [ebp-16]
     .scr       equ        [ebp-24]
     .xres      equ        [ebp-28]
     .yres      equ        [ebp-32]
     .middle    equ        [ebp-48]
     .factor    equ        [ebp-64]
     .trans     equ        [ebp-80]
     .scr2      equ dword  [ebp-84]      ;  \
     .zbuff     equ dword  [ebp-88]      ;  |  > dont xchg
     .tex_ptr   equ dword  [ebp-92]      ;  |    order !!
     .width     equ        [ebp-96]      ; /

     sub        esp,112
     ; align the frame base down; it is restored by `pop ebp`
     and        ebp,-16

     mov       .ptr,edx             ; init
     mov       .ptr_end,eax         ; local
     mov       eax,200000           ; variables
     mov       .big, eax            ;
     neg       eax                  ;
     mov       .small,eax
     ; one store fills .yres, .xres and .scr (4th dword lands at ebp-20)
     movups    .yres,xmm0
     shufps    xmm0,xmm0,11110001b
     ; .width = x res
     movss     .width,xmm0

     mov       esi,edx
     cld
     ; lane 0: xmm3 = +200000.0 (running min), xmm4 = -200000.0 (running max)
     cvtpi2ps  xmm3,.big
     cvtpi2ps  xmm4,.small
     ; find min / max
   .l1:
     lodsw
     cwde
     cvtsi2ss  xmm1,eax
     maxps     xmm4,xmm1
     minps     xmm3,xmm1
     cmp       esi,.ptr_end
     jb        .l1
     ; xmm2 = (min + max) / [f2x4]  -> "middle"
     movaps    xmm2,xmm3
     addps     xmm2,xmm4
     divps     xmm2,[f2x4]

     mov       ebx,.yres
     shr       ebx,1
     ; xmm4 = y res / 2          -> vertical translation
     cvtsi2ss  xmm4,ebx
     lea       ebx,[ebx*3]
     shr       ebx,2
     ; xmm3 = (y res / 2) * 3/4  -> amplitude factor
     cvtsi2ss  xmm3,ebx

     ; xmm5 = y res - 1 = last valid row, used as the upper clamp
     mov       eax,.yres
     dec       eax
     cvtsi2ss  xmm5,eax

     mov       esi,.ptr
     xor       ecx,ecx
     ; xmm7 = all ones = pixel pattern, xmm1 ~ 1 / middle
     pcmpeqd   xmm7,xmm7
     rcpps     xmm1,xmm2 ;.middle
   .ll:
     ; ecx = number of samples drawn so far = current column
     xorps     xmm6,xmm6
     lodsw
     cwde
     cvtsi2ss  xmm0,eax
     mov       eax,ecx
     inc       ecx
     ; row = (sample - middle) / middle * factor + trans
     subps     xmm0,xmm2 ;.middle
     mulps     xmm0,xmm1

     mulps     xmm0,xmm3 ;.factor
     addps     xmm0,xmm4 ;.trans

     ; clamp the row to 0 .. y res - 1
     minps     xmm0,xmm5
     maxps     xmm0,xmm6
     cvtss2si  ebx,xmm0
     ; ebx = screen + (row * width + column) * 4
     imul      ebx,.width
     add       ebx,eax
     shl       ebx,2
     add       ebx,.scr
     movups    [ebx],xmm7

     ; stop once a full row of columns has been drawn
     cmp       ecx,.xres
     jae       @f
     cmp       esi,.ptr_end
     jb        .ll
   @@:
     add       esp,112
     pop       ebp
ret

;===========================================================
morph_anim:
; One animation step that pulls the vertices of every triangle towards the
; first three points of tetra_points_r.
; For each triangle (indices i0, i1, i2) the vertices in [points_r_ptr] are
; updated as
;     p(i2) += (tet1 - p(i2)) * 0.00004 * t
;     p(i1) += (tet2 - p(i1)) * 0.00004 * t
;     p(i0) += (tet3 - p(i0)) * 0.00004 * t
; where t = [to_pieces_var] after it has been incremented by this call.
; in:  globals only
; out: points updated in place; eax, ebx, ecx, edx, esi, edi and xmm0-xmm7
;      are not preserved.
; Only 12 bytes are written per vertex, so a store never touches the x
; coordinate of the following vertex.
    push        ebp
    mov         ebp,esp
    ; 64 bytes of locals + up to 15 bytes lost by the alignment below,
    ; rounded so that esp stays dword aligned
    sub         esp,80
    and         ebp,-16
    .multip     equ [ebp-16]
    .tet1       equ [ebp-32]
    .tet2       equ [ebp-48]
    .tet3       equ [ebp-64]

    mov      edi,tetra_points_r
    movups   xmm0,[edi]
    movups   xmm1,[edi+12]
    movups   xmm2,[edi+24]
    movaps   .tet1,xmm0
    movaps   .tet2,xmm1
    movaps   .tet3,xmm2

    pcmpeqd  xmm6,xmm6
    inc      [to_pieces_var]
    xorps    xmm3,xmm3
    mov      eax,0.00004
    ; xmm6 = sign-bit mask in every lane
    pslld    xmm6,31
    cvtsi2ss xmm3,[to_pieces_var]
    movd     xmm7,eax
    mov      edi,[points_r_ptr]
    ; xmm3 = (t, t, t, 0), xmm7 = 0.00004 in every lane
    shufps   xmm3,xmm3,11000000b
    shufps   xmm7,xmm7,0
    movups   .multip,xmm3
    mov      esi,[triangles_ptr]
    mov      ecx,[triangles_count_var]
    ; `loop` with ecx = 0 would run 2^32 times
    test     ecx,ecx
    jz       .done
    cld
  .ll:
    ; edx, ebx, eax = byte offsets of the 1st, 2nd and 3rd vertex
    lodsd
    xchg     eax,edx
    lodsd
    xchg     eax,ebx
    lodsd
    imul     edx,12
    imul     ebx,12
    imul     eax,12
    movups   xmm3,[edi+eax]
    movups   xmm4,[edi+ebx]
    movups   xmm5,[edi+edx]
    movaps   xmm0,xmm3
    movaps   xmm1,xmm4
    movaps   xmm2,xmm5
    subps    xmm3,.tet1
    subps    xmm4,.tet2
    subps    xmm5,.tet3
    xorps    xmm3,xmm6  ; xchg sign
    xorps    xmm4,xmm6
    xorps    xmm5,xmm6
    mulps    xmm3,xmm7
    mulps    xmm4,xmm7
    mulps    xmm5,xmm7
    mulps    xmm3,.multip
    mulps    xmm4,.multip
    mulps    xmm5,.multip
    addps    xmm3,xmm0
    addps    xmm4,xmm1
    addps    xmm5,xmm2
    ; write x,y then z: exactly 12 bytes per vertex
    movlps   [edi+eax],xmm3
    movhlps  xmm3,xmm3
    movss    [edi+eax+8],xmm3
    movlps   [edi+ebx],xmm4
    movhlps  xmm4,xmm4
    movss    [edi+ebx+8],xmm4
    movlps   [edi+edx],xmm5
    movhlps  xmm5,xmm5
    movss    [edi+edx+8],xmm5
    loop     .ll
  .done:
    add      esp,80
    pop      ebp
ret
;============================================================
make_bumps:
; Push the vertices near a reference position in or out ("bump").
; in eax = position
;    ebx = direction
; eax is read as two 16-bit words that are widened and converted to floats
; (.ref_pt); only the sign of bx is used: negative selects the sign-bit
; mask for .dir_msk, otherwise the mask is zero.
; The loop goes through [points_count_var] points of [points_rotated_ptr];
; a point is modified when its scaled distance from .ref_pt is not above
; .radius and the Z-buffer value at its pixel lies within .margin of its z.
; The modified point is transformed with the matrix produced by
; reverse_mx_3x3 from matrix_scaled and written to the same index of
; [points_r_ptr].
; NOTE(review): reverse_mx_3x3 and rotary are defined elsewhere; presumably
; the former builds the inverse rotation - confirm.
    push       ebp

    mov        ebp,esp
    sub        esp,176
    ; move the frame base down and align it to 16 bytes so that the
    ; movaps locals below are aligned; `pop ebp` restores the caller's ebp
    sub        ebp,36
    and        ebp, - 16
    .mx        equ [ebp-36]
    .radius    equ [ebp-40]
    .Zbuffer   equ [ebp-44]
    .margin    equ [ebp-48]
    .point     equ [ebp-64]
    .ref_pt    equ [ebp-80]   ; reference point
    .abs_msk   equ [ebp-96]
    .abc       equ [ebp-112]  ; ellipsoid half axes
    .xres      equ [ebp-116]
    .pts_r_ptr equ [ebp-120]  ; [points_rotated_ptr]
    .pts       equ [ebp-124]  ; [points_r_ptr]
    .xadd      equ [ebp]
    .dir_msk   equ [ebp+16]
    .one       equ [ebp+32]


    ; .abs_msk = 0x7FFFFFFF in every lane (clears the sign bit)
    pcmpeqd    xmm3,xmm3
    psrld      xmm3,1
    movaps     .abs_msk,xmm3
    ; xmm3 = sign-bit mask, kept only when bx is negative
    pslld      xmm3,31
    or         bx,bx
    jl         @f
    xorps      xmm3,xmm3
   @@:
    movaps     .dir_msk,xmm3  ; bump convex or concave, sub or add

    mov        esi,1
    mov        edi,4
    cvtdq2ps   xmm5,[xxadd]
    movlps     xmm2,[NextMxadd]
    movlps     xmm3,[NextMsub]
    ; two words of value 2
    mov        edx,2 shl 16 + 2
    mov        .margin,edi
    movd       xmm4,edx
    mov        edi,[points_r_ptr]
    mov        .one,esi
    movaps     .xadd,xmm5

    movzx      ecx,[xres_var]
    movd       xmm0,eax
    mov        .pts,edi
    mov        .xres,ecx
    xorps      xmm1,xmm1
    ; widen the packed words to dwords
    punpcklwd  xmm0,xmm1
    mov        eax,[Zbuffer_ptr]
    mov        ebx,4
    punpcklwd  xmm2,xmm1
    punpcklwd  xmm3,xmm1
    punpcklwd  xmm4,xmm1         ; ellipsoid
    mov        .Zbuffer,eax      ; (x/a)^2+(y/b)^2+(z/c)^2=1
    ; xmm2 = NextMxadd - NextMsub + 2 (per word), later scaled by (1/4)^2
    psubd      xmm2,xmm3
    paddd      xmm2,xmm4
    cvtsi2ss   xmm4,ebx
    shufps     xmm4,xmm4,0
    rcpps      xmm4,xmm4 ;[const4]
    cvtdq2ps   xmm0,xmm0
    cvtdq2ps   xmm2,xmm2
    mulps      xmm2,xmm4
    mulps      xmm2,xmm4
    movups     .ref_pt,xmm0
    ; .abc holds the reciprocals, so the loop can multiply instead of divide
    rcpps      xmm2,xmm2
    movups     .abc,xmm2
    mov        esi,matrix_scaled
    lea        edi,.mx
    call       reverse_mx_3x3

    mov        esi,[points_rotated_ptr]
    mov        .pts_r_ptr,esi
    mov        edx,30                 ; bump radius
    cvtsi2ss   xmm1,edx
    movss      .radius,xmm1
    mov        ecx,[points_count_var]
    cld
  .ll:
    ; esi = current rotated point, ecx = points left
    push       ecx
    push       esi
    movups     xmm2,[esi]
    movaps     xmm4,xmm2
    ; lane 0 of xmm3 = sqrt(dx^2 + dy^2) of the scaled offset
    subps      xmm2,.ref_pt
    mulps      xmm2,.abc
    mulps      xmm2,xmm2
    haddps     xmm2,xmm2
    sqrtps     xmm3,xmm2

    ; lane 0 of xmm5 and xmm7 = z of the point
    movhlps    xmm5,xmm4
    movhlps    xmm7,xmm4
    ; eax = integer x, edx = integer y of the point
    sub        esp,8
    cvtps2dq   xmm6,xmm4
    movlps     [esp],xmm6
    pop        eax edx
    comiss     xmm3,.radius
    ; too far from the reference point
    ja         .nxx

    ; eax = address of the Z-buffer entry of pixel (x, y)
    imul       edx,.xres
    add        eax,edx
    shl        eax,2
    add        eax,.Zbuffer
    cvtpi2ps   xmm6,.margin  ; Z coord margin
    addps      xmm5,xmm6
    subps      xmm7,xmm6
    ; exactly one of (z-margin < Zbuf) and (z+margin < Zbuf) holds when the
    ; Z-buffer value lies between them
    cmpltss    xmm7,[eax]
    cmpltss    xmm5,[eax]
    xorps      xmm7,xmm5
    movmskps   eax,xmm7
    test       eax,.one
    jz         .nxx
    push       esi
    ; xmm2 = bump position
    ; displacement = sqrt(|d^2 - radius^2|)
    movlps     xmm5,.radius
    mulps      xmm5,xmm5
    subps      xmm2,xmm5
    andps      xmm2,.abs_msk
    sqrtps     xmm2,xmm2

  ;  movaps     xmm6,xmm4
  ;  mov        eax,NEXT_CURV_NUMB
  ;  subps      xmm6,xmm5
  ;  cvtsi2ss   xmm7,eax
  ;  mulps      xmm6,.abc
  ;  mulps      xmm6,xmm7
  ;  cvtss2si   edi,xmm6
  ;  shl        edi,2
  ;  add        edi,NextBendDerv
  ;  movlps     xmm7,[edi]
  ;  addps      xmm2,xmm7

    ; move the displacement into lanes 2 and 3 (the z lane is the one used)
    pslldq     xmm2,8
    lea        esi,.point
    subps      xmm4,.xadd
    xorps      xmm2,.dir_msk ; bump convex or concave
    addps      xmm4,xmm2
    ; xmm4 = modified point
    ; lets transfer it to points_r_ptr
    mov        edi,esi
    movups     [esi],xmm4
    lea        ebx,.mx
    mov        ecx,.one
    call       rotary
    ; edi = same point index inside [points_r_ptr]
    pop        edi
    sub        edi,.pts_r_ptr
    add        edi,.pts
    lea        esi,.point
    movsd
    movsd
    movsd
  .nxx:
    pop        esi ecx
    add        esi,12
    dec        ecx
    jnz        .ll
;   loop       .ll
    add        esp,176
    pop        ebp
ret
;===========================================
to_pieces_anim:
; One step of the "fall to pieces" animation.
; 1. [to_pieces_var] is incremented (t).
; 2. .lp_add: every triangle's vertices are read from the points_r buffer,
;    scaled by [scale], moved along the triangle normal times t minus a
;    term growing with t^3, and written to the points_rotated buffer.
; 3. .rot1: every triangle is rotated around its centroid about a
;    pseudo-random axis by the angle 0.04 * t, then [xxadd] is added; the
;    three vertex normals are rotated with the same matrix.
; Nine pointer/count dwords starting at triangles_normals_ptr are copied
; into the frame first (depends on the globals' layout).
; NOTE(review): make_arbitrary_mx and rotary are defined elsewhere; their
; register contracts are assumed from the way they are called here.

   .vect    equ [ebp-16]
   .cos     equ [ebp-20]
   .sin     equ [ebp-24]
   .mx      equ [ebp-60]
   .pts     equ [ebp-(64+12*3)]
   .mid_val equ [ebp-116]
   .xesi    equ [ebp-120]
   .one     equ [ebp-124]

   .triangles_normals_ptr         equ [ebp]
   .triangles_normals_rotated_ptr equ [ebp+4]
   .points_normals_ptr            equ [ebp+8]
   .points_count_var              equ [ebp+12]
   .triangles_count_var           equ [ebp+16]
   .points_r_ptr                  equ [ebp+20]
   .triangles_ptr                 equ [ebp+24]
   .points_rotated_ptr            equ [ebp+28]
   .points_normals_rotated_ptr    equ [ebp+32]
   .rotary                        equ dword [ebp+36]
   .tri_part                      equ xword [ebp+52]
;  .ptr                           equ dword [ebp+68]

    push     ebp
    mov      ebp,esp
    ; ebp is moved 80 bytes down so that positive offsets are locals too
    sub      ebp,80
    sub      esp,250
    ; copy 9 dwords of globals into [ebp] .. [ebp+35]
    mov      esi,triangles_normals_ptr
    lea      edi,.triangles_normals_ptr
    cld
    mov      ecx,9
    rep      movsd
    inc      [to_pieces_var]
    mov      .one,dword 1
    ; rotary is called through this local pointer
    mov      .rotary,dword rotary
    mov      eax,0.33333
    movd     xmm4,eax
    shufps   xmm4,xmm4,0
    movups   .tri_part,xmm4

    mov      esi,.triangles_ptr
    mov      ebx,.points_r_ptr  ;  _rotated_ptr
    mov      edi,.triangles_normals_rotated_ptr
    mov      eax,-0.000028
    mov      edx,10000.3
    mov      ecx,.triangles_count_var
    xorps    xmm0,xmm0
    movd     xmm0,eax
    movd     xmm2,edx

    shufps   xmm2,xmm2,0
    ; xmm3 = t in every lane
    cvtsi2ss xmm3,[to_pieces_var]
    shufps   xmm3,xmm3,0
    ; xmm0 = (0, -0.000028, 0, 0) * t^3
    shufps   xmm0,xmm0,11110011b
    mulps    xmm0,xmm3
    mulps    xmm0,xmm3
    mulps    xmm0,xmm3
    movss    xmm1,[scale]
    shufps   xmm1,xmm1,0
  .lp_add:
    mov      ebx,.points_r_ptr
    cld
    ; the three byte offsets are pushed and popped again for the stores
    lodsd
    imul     eax,12
    push     eax
    movups   xmm7,[eax+ebx]
    lodsd
    imul     eax,12
    push     eax
    movups   xmm6,[eax+ebx]
    lodsd
    imul     eax,12
    push     eax
    movups   xmm5,[eax+ebx]

    mulps    xmm5,xmm1
    mulps    xmm6,xmm1
    mulps    xmm7,xmm1

    ; the bit pattern of the last offset is reused as a float; xmm0 keeps
    ; the subtraction across iterations
    movd     xmm4,eax
    shufps   xmm4,xmm4,0
    minps    xmm4,xmm2     ; random factor
    subps    xmm0,xmm4

    movups   xmm4,[edi]    ; normal job
    mulps    xmm4,xmm3
    subps    xmm4,xmm0

    addps    xmm7,xmm4     ; move 'down'
    addps    xmm6,xmm4
    addps    xmm5,xmm4

    mov      ebx,.points_rotated_ptr
   ; update points
    ; offsets come back in reverse order: 3rd, 2nd, 1st vertex;
    ; each store writes 12 bytes (x,y then z)
    pop      eax

    movlps   [eax+ebx],xmm5
    movhlps  xmm5,xmm5
    movss    [eax+ebx+8],xmm5
    pop      eax
    movlps   [eax+ebx],xmm6
    movhlps  xmm6,xmm6
    movss    [eax+ebx+8],xmm6
    pop      eax
    movlps   [eax+ebx],xmm7
    movhlps  xmm7,xmm7
    movss    [eax+ebx+8],xmm7
    add      edi,12
  ;  dec      ecx
  ;  jnz      .lp_add
    loop    .lp_add


    ; angle = 0.04 * t, passed to the FPU through the stack
    mov      eax,0.04
    movd     xmm0,eax
    mulps    xmm0,xmm3
    movd     eax,xmm0
    push     eax

    fninit
    fld      dword[esp]
    ; fsincos leaves cos in st0 and sin in st1
    fsincos
    fstp     dword .cos
    fstp     dword .sin
    pop      eax

if 0
   mov      eax,[points_count_var]
   shr      eax,5
   cmp      eax,10
   jna      @f
   add      eax,10
   imul     eax,12
   malloc   eax
   mov      .ptr,eax
 @@:
   xor      ecx,ecx
   mov      esi,.triangles_ptr
   push     ecx
   and      ecx,11111b
   cmp      ecx,1
   jne      .en_la
   lodsd
   imul     eax,12
   push     eax
   mov      edi,.points_rotated_ptr
   movups   xmm0,[eax+edi]
   lodsd
   imul     eax,12
   movups   xmm1,[eax+edi]
   push     eax
   lodsd
   imul     eax,12
   movups   xmm2,[eax+edi]
end if

;   mov      edi,.triangles_normals_ptr
    mov      esi,.triangles_ptr
    mov      ecx,.triangles_count_var
 .rot1:
    ; stack inside the loop: counter, then the three vertex offsets
    push     ecx
;   push     edi
    lodsd
    imul     eax,12
    push     eax
    mov      edi,.points_rotated_ptr
    movups   xmm0,[eax+edi]
    lodsd
    imul     eax,12
    movups   xmm1,[eax+edi]
    push     eax
    lodsd
    imul     eax,12
    movups   xmm2,[eax+edi]
    push     eax

    ; remember the read position; esi is reused below
    mov      .xesi,esi
    lea      edi,.pts
    ; xmm3 = centroid, xmm0..xmm2 = vertices relative to it
    movups   xmm4,.tri_part
    movaps   xmm3,xmm0
    shufps   xmm4,xmm4,0
    addps    xmm3,xmm1
    addps    xmm3,xmm2
    mulps    xmm3,xmm4
    subps    xmm0,xmm3
    subps    xmm1,xmm3
    subps    xmm2,xmm3
    lea      edi,.pts
    movups   [edi],xmm0
    movups   [edi+12],xmm1
    movups   [edi+24],xmm2
    movups   .mid_val,xmm3

    ; mix the pointer and the counter into a pseudo-random bit pattern
    ror      esi,11
    ror      ecx,4
    xor      esi,ecx

    movd     xmm4,esi
    shufps   xmm4,xmm4,0

    mulps    xmm0,xmm4 ; random vector

    ; normalise (xmm0 - xmm1) over x,y,z -> rotation axis
    subps    xmm0,xmm1
    movaps   xmm2,xmm0
    dpps     xmm0,xmm0,01110111b
    rsqrtps  xmm0,xmm0
    mulps    xmm2,xmm0
    movups   .vect,xmm2

    ; esi -> (.sin, .cos), ebx -> axis, edi -> resulting matrix
    lea      esi,.sin
    lea      ebx,.vect
    lea      edi,.mx
    call     make_arbitrary_mx

    ; rotate the three centred vertices in place
    lea      esi,.pts
    mov      edi,esi
    lea      ebx,.mx
    mov      ecx,3
    call     .rotary

    ; esi, ecx, edx = offsets of the 3rd, 2nd and 1st vertex
    pop      esi ecx edx
    lea      edi,.pts
    movups   xmm0,[edi]
    movups   xmm1,[edi+12]
    movups   xmm2,[edi+24]
    cvtdq2ps xmm4,[xxadd]

    ; put the centroid (+ xxadd) back
    movups   xmm3,.mid_val
    addps    xmm3,xmm4
    addps    xmm0,xmm3
    addps    xmm1,xmm3
    addps    xmm2,xmm3
    movhlps  xmm4,xmm0
    movhlps  xmm5,xmm1
    movhlps  xmm6,xmm2
    mov      edi,.points_rotated_ptr
    movlps   [edx+edi],xmm0
    movlps   [ecx+edi],xmm1
    movlps   [esi+edi],xmm2
    movss    [edx+8+edi],xmm4
    movss    [ecx+8+edi],xmm5
    movss    [esi+8+edi],xmm6

    push     edx ecx

    ; rotate the normal of the 3rd, then the 2nd, then the 1st vertex
    add      esi,.points_normals_rotated_ptr
    lea      ebx,.mx
    mov      ecx,.one
    mov      edi,esi
    call     .rotary
    pop      esi
    add      esi,.points_normals_rotated_ptr
    mov      edi,esi
    lea      ebx,.mx
    mov      ecx,.one
    call     .rotary
    pop      esi
    lea      ebx,.mx
    mov      ecx,.one
    add      esi,.points_normals_rotated_ptr
    mov      edi,esi
    call     .rotary

  ; pop      edi
    pop      ecx
  ; add      edi,12
    mov      esi,.xesi
    dec      ecx
    jnz      .rot1
 ;  loop     .rot1

    add      esp,250
    pop      ebp
ret
;==================================================
to_pieces:
; Split the object into independent triangles: every triangle gets its own
; three copies of the vertices it uses, so no vertex is shared any more.
; in:  globals [triangles_count_var], [triangles_ptr], [points_r_ptr]
; out: [points_r_ptr] -> new buffer with 3 points per triangle (the old one
;      is released with mfree), [points_count_var] = 3 * triangle count,
;      the index buffer at [triangles_ptr] is rewritten in place as
;      0,1,2,3,..., [to_pieces_var] = 0.
;      With no triangles only [to_pieces_var] is reset.
; eax, ebx, ecx, esi and edi are not preserved.
   .new_pts equ [ebp-4]
   .tcv3    equ [ebp-8]

   push     ebp
   mov      ebp,esp
   sub      esp,12
   xor      eax,eax

   mov      ecx,[triangles_count_var]
   mov      [to_pieces_var],eax
   ; nothing to split; `loop` with ecx = 0 would run 2^32 times
   test     ecx,ecx
   jz       .done
   push     ecx
   ; room for 3 points of 12 bytes per triangle, plus slack
   add      ecx,100
   imul     ecx,36
   malloc   ecx
   mov      .new_pts,eax
   pop      ecx
   ; ecx = number of indices = number of new points
   lea      ecx,[ecx*3]
   mov      .tcv3,ecx

   mov      ebx,[points_r_ptr]
   mov      edi,eax
   mov      esi,[triangles_ptr]
   cld
  @@:
   ; copy the point referenced by the next index to the new buffer;
   ; eax keeps the index read position while esi addresses the point
   lodsd
   imul     eax,12
   xchg     eax,esi
   add      esi,ebx
   movsd
   movsd
   movsd
   xchg     eax,esi
   loop     @b

   mfree    [points_r_ptr]
   mov      eax,.new_pts
   mov      [points_r_ptr],eax
   mov      edi,[triangles_ptr]
   mov      ecx,.tcv3
   mov      [points_count_var],ecx
   xor      eax,eax
   cld
  @@:
   ; index n now simply refers to point n
   stosd
   inc      eax
   loop     @b

  .done:
   mov      esp,ebp
   pop      ebp
ret
;=========================================================
rem_tri:
  ; Mark one triangle as removed by zeroing its first two vertex indices.
  ; in:  eax = number of the triangle to remove
  ; out: esi = [triangles_ptr], eax = byte offset of the triangle, ebx = 0
  mov      esi,[triangles_ptr]
  lea      eax,[eax+eax*2]      ; eax = index * 3 ...
  shl      eax,2                ; ... * 4 = index * 12 bytes per triangle
  xor      ebx,ebx
  mov      [esi+eax],ebx
  mov      [esi+eax+4],ebx
ret
;==========================================================
make_series:
;  clone/copy current  object
; The new object consists of the original plus one copy:
;   - triangles: the original indices followed by the same indices shifted
;     by the original point count,
;   - points: taken from the points_rotated buffer, followed by the same
;     points with x moved by 0.3 * [tolerancy_flag] * [scale].
; in:  five consecutive dwords starting at points_count_var (points count,
;      triangles count, points_r ptr, triangles ptr, points_rotated ptr -
;      depends on the globals' layout), [tolerancy_flag], [scale]
; out: [points_r_ptr] / [triangles_ptr] -> new buffers (the old ones are
;      released with mfree), [points_count_var] and [triangles_count_var]
;      doubled.
; An object with no points or no triangles is handled without running the
; per-element loops.
; eax, ebx, ecx, edx, esi, edi, xmm0, xmm6 and xmm7 are not preserved.
    push        ebp
    mov         ebp,esp
    sub         esp,28

    .p_ptr               equ [ebp-28]
    .tris_ptr            equ [ebp-24]
    .points_count_var    equ dword[ebp-20]
    .triangles_count_var equ dword[ebp-16]
    .points_r_ptr        equ dword[ebp-12]
    .triangles_ptr       equ dword[ebp-8]
    .points_rotated_ptr  equ dword[ebp-4]

    ; snapshot the five globals into the frame
    lea         edi,.points_count_var
    lea         esi,[points_count_var]
    cld
    movsd
    movsd
    movsd
    movsd
    movsd

    ; points buffer: count * 24 bytes (two copies of 12) + slack
    mov         ebx,.points_count_var
    lea         ebx,[ebx*3]
    shl         ebx,3
    add         ebx,1000
    malloc      ebx
    mov         .p_ptr,eax

    ; triangles buffer: count * 24 bytes + slack
    mov         ebx,.triangles_count_var
    lea         ebx,[ebx*3]
    shl         ebx,3
    add         ebx,1000
    malloc      ebx
    mov         .tris_ptr,eax
    mov         edi,eax
    mov         esi,.triangles_ptr
    mov         ecx,.triangles_count_var
    lea         ecx,[ecx*3]
    push        ecx
    push        esi
    cld
    ; first copy: indices unchanged
    rep         movsd
    pop         esi
    pop         ecx
    mov         edx,.points_count_var    ; make tris copy
    ; `loop` with ecx = 0 would run 2^32 times
    test        ecx,ecx
    jz          .tris_done
   @@:
    ; second copy: indices shifted to the cloned points
    lodsd
    add         eax,edx
    stosd
    loop        @b
  .tris_done:
    ; xmm7 = 0.3 * tolerancy_flag * scale = x shift of the clone
    movzx       ebx,[tolerancy_flag]
    cvtsi2ss    xmm6,ebx
    mulss       xmm6,[scale]
    mov         eax,0.3
    movd        xmm7,eax
    mulss       xmm7,xmm6
    mov         ecx,.points_count_var
    push        ecx
    lea         ecx,[ecx*3]
    mov         esi,.points_rotated_ptr
    push        esi
    mov         edi,.p_ptr
    ; first copy: points unchanged
    rep         movsd
    pop         esi
    pop         ecx
    test        ecx,ecx
    jz          .pts_done
  .b1qq:
    ; second copy: x shifted, y and z copied as they are
    lodsd
    movd        xmm0,eax
    addps       xmm0,xmm7
    movd        eax,xmm0
    stosd
    movsd
    movsd
    loop        .b1qq
  .pts_done:

    mfree       [points_r_ptr]
    mfree       [triangles_ptr]

    mov         eax,[points_count_var]
    add         eax,eax
    mov         [points_count_var],eax
    mov         eax,[triangles_count_var]
    add         eax,eax
    mov         [triangles_count_var],eax

    mov         ebx,.p_ptr
    mov         edx,.tris_ptr
    mov         [points_r_ptr],ebx
    mov         [triangles_ptr],edx

    mov         esp,ebp
    pop         ebp
ret
;============================================================
calc_combo:
    ; Recalculate every derived structure. Only the triangles list and the
    ; points list are needed as input; the routine is meant to be called
    ; after any editing routine so that all pre-calculations (non-triangle
    ; removal, unused vertices, normals, chunks, edges, bump/texture
    ; coordinates) run in one fixed order.
    ; in:  eax = 'firs' -> the free/alloc step below is skipped
    ;            (the value is kept on the stack as .mem_sign)
    .mem_sign equ dword [ebp-4]
 ;   .tri_ch   equ [ebp-8]
 ;   .t_ptr    equ [ebp-12]
    push      ebp
    mov       ebp,esp

     push     eax                          ; -> .mem_sign
     call     remove_non_tri
     mov      edx,[points_count_var]
     mov      ecx,[triangles_count_var]
     mov      ebx,[points_r_ptr]
     mov      eax,[triangles_ptr]
     call     remove_unused_vertices
     mov      [points_count_var],ecx       ; ecx = count returned by the call
     cmp      .mem_sign,'firs'
     je       .no_mem_work
     mov      eax,33       ; not free tri and vert
     call     free_mem_for_tp
     xor      eax,eax      ; not alloc for tri and vert
     call     alloc_mem_for_tp
 .no_mem_work:
     mov      ecx,[points_count_var]
     mov      edi,[points_r_ptr]
     call     normalize_object
     call     init_triangles_normals
     call     init_point_normals

     xor      eax,eax
     call     detect_chunks
     mov      [chunks_count],ecx
     mov      [chunks_ptr],ebx

  ;   call     do_edges_list    ; find [greatest_chunk]
     ; second clean-up pass, same sequence as at the top
     call     remove_non_tri
     mov      edx,[points_count_var]
     mov      ecx,[triangles_count_var]
     mov      ebx,[points_r_ptr]
     mov      eax,[triangles_ptr]
     call     remove_unused_vertices
     mov      [points_count_var],ecx
     call     sort_chunks
  ;   call     opt_chunks
     ; opt as default

     call     do_edges_list
     mov      eax,[tex_scale]
     call     calc_bumpmap_coords
  ;   mov      al,2
  ;   call     re_alloc_stenc_shadows
     leave                                 ; mov esp,ebp / pop ebp
ret
;========================================================================
if 0
aabb_inter:  ; axis aligned bounding box intersection (slab test)
; Currently disabled by the surrounding 'if 0'.
; in:
;       xmm0 - ray direction
;       xmm1 - ray origin
;       xmm2 - box min x,y,z
;       xmm3 - box max x,y,z
; out:
;       eax = 0 -> ray intersects box
;       eax = 0xffffffff -> no intersection
       push     ebp
       mov      ebp,esp
       and      ebp,-16       ; 16-byte aligned frame for movaps
       sub      esp,120

       .dir     equ [ebp-16]
       .dirz    equ [ebp-8]
       .origin  equ [ebp-32]
       .box_min equ [ebp-48]
       .box_max equ [ebp-64]

       .tmin    equ [ebp-80]  ; x lane of the near distances
       .tymin   equ [ebp-76]  ; y lane
       .tzmin   equ [ebp-72]  ; z lane

       .tmax    equ [ebp-96]  ; x lane of the far distances
       .tymax   equ [ebp-92]  ; y lane
       .tzmax   equ [ebp-88]  ; z lane


       rcpps    xmm7,xmm0     ; xmm7 = 1 / direction (approximation)
       movaps   .box_min,xmm2
       movaps   .box_max,xmm3
       movaps   .dir,xmm0
       movaps   .origin,xmm1
       movaps   xmm6,xmm2
       movaps   xmm5,xmm3
       ; slab distances are (bound - origin) / direction; the previous
       ; code subtracted the direction (xmm0) here instead of the origin
       subps    xmm6,xmm1
       subps    xmm5,xmm1
       mulps    xmm6,xmm7    ; xmm6 -  min
       mulps    xmm5,xmm7    ; xmm5  - max
       movaps   xmm0,xmm6
       minps    xmm6,xmm5     ; per axis: near distance
       maxps    xmm5,xmm0     ; per axis: far distance
       movaps   .tmin,xmm6
       movaps   .tmax,xmm5    ;  cmpnltss
       cmpnltss xmm6,.tymax  ;tmin > tymax ; 11 if grt
       cmpless  xmm5,.tymin  ;tmax <= tymin ; 11 if grt
       orps     xmm6,xmm5    ; xmm6 low lane != 0 -> x/y slabs do not overlap
;      movd     eax,xmm6
;      or       eax,eax
;      jnz      @f            ; false

       movss    xmm0,.tmin
       movss    xmm1,.tmax
       maxss    xmm0,.tymin   ; combined near of x and y
       minss    xmm1,.tymax   ; combined far of x and y

       cmpnltss xmm0,.tzmax  ; tmin >  tzmax
       cmpless  xmm1,.tzmin  ; tmax <= tzmin
       orps     xmm0,xmm1
       orps     xmm0,xmm6    ; merge with the x/y result
       movd     eax,xmm0

       add      esp,120
       pop      ebp
ret
end if
;====================================================================
mark_colided_edges:
; Driver for the collided-edges pass. Prepares shared work buffers, runs
; mark_colided_edges_th on 4 threads and merges the results.
; Proc has function of smoothing inside edges also.
; in:
;   eax = 0    -> mark collided edges
;   eax = 1,2  -> smooth inside edges, method selector
;   eax = 3    -> tessellate, new vertex in the intersection point
; shared with the threads through thread_params:
;   [thread_params]     modified vertices list (filled with -1 = "untouched")
;   [thread_params+4]   copy of eax (mode)
;   [thread_params+12]  triangles list sorted by minimal 'x'
;   [thread_params+16]  mode 3 only: bit mask of triangles to remove
;   thread_params+20    mode 3 only: four 16-byte records, one per thread
; out:
;   mode 0,3 (non tess path): [edges_coll_count] = number of set bits
;                             counted in [edges_intersect_ptr]
;   mode 1,2: touched vertices are written back into [points_r_ptr]
;   mode 3:   [points_r_ptr], [triangles_ptr] and both counters are
;             replaced when at least one new triangle was produced

      push     ebp
      mov      ebp,esp
      sub      esp,76

      .minx_ptr         equ dword [ebp-4]   ;
      .minx_ptr2        equ dword [ebp-8]   ;
      .srt_ptr          equ dword [ebp-12]  ; sorted tris
      .ed_srt           equ dword [ebp-16]

      ; snapshot of 8 consecutive globals starting at [points_count_var]
      .points_count_var equ dword [ebp-64]
      .trian_count_var  equ dword [ebp-60]
      .points_r_ptr     equ dword [ebp-56]
      .triangles_ptr    equ dword [ebp-52]
      .po_rot_ptr       equ dword [ebp-48]
      .points_n_rot_ptr equ dword [ebp-44]
      .edges_ptr        equ dword [ebp-40]
      .edges_count      equ dword [ebp-36]

      ; the tess merge step reuses (overwrites) four of the slots above
      .vert_sum         equ dword [ebp-40]  ; \  not xchg
      .tri_sum          equ       [ebp-44]  ; /  order
      .t_ptr            equ dword [ebp-48]
      .v_ptr            equ dword [ebp-52]

      cld
      lea      esi,[points_count_var]
      lea      edi,.points_count_var
      cld
      mov      ecx,8
      rep      movsd

      push     eax                  ; mode, popped again at .en_u
      mov      [thread_params+4],eax
      cmp      al,2
      je       @f
      cmp      al,1
      je       @f
      prompt   prompt_det_coll_ed  ; macro
      jmp      .f
    @@:
      prompt   prompt_smooth_in_ed
      jmp      .no_mallo
    .f:
      cmp      byte [thread_params+4],3
      jne      .no_mallo
      mov      eax,[triangles_count_var]
      shr      eax,3
      add      eax,200
      malloc   eax
      ; memory for tri to rem msk
      mov      [thread_params+16],eax
      mov      edi,eax
      mov      ecx,[triangles_count_var]
      shr      ecx,3 + 2
      add      ecx,10
      xor      eax,eax
      cld
      rep      stosd                ; clear the mask
   .no_mallo:
      mov      ebx,.points_count_var
      add      ebx,80
      imul     ebx,12
      malloc   ebx
      mov      [thread_params],eax
      push     eax                  ; popped after the sort, see below
      ; **************************
      ; speed up mem structs below
      ; **************************
      mov      eax,.trian_count_var
      add      eax,100
      imul     eax,12
      malloc   eax
      ; sorted  triangles list
      mov      .srt_ptr,eax
      mov      [thread_params+12],eax

      mov      eax,.trian_count_var
      add      eax,100
      shl      eax,3
      push     eax
      malloc   eax
      mov      .minx_ptr,eax
      ; ptr to tri indices + min tri and max ed 'x' coord
      pop      eax
      malloc   eax
      mov      .minx_ptr2,eax

      ; build (key, triangle number) pairs, key = scaled minimal 'x'
      mov      esi,.triangles_ptr
      xor      ecx,ecx
      mov      edi,eax
      cld
      mov      eax,10000
      cvtsi2ss xmm3,eax             ; scale
      shl      eax,2
      cvtsi2ss xmm4,eax             ; bias (40000)
    @@:
      lodsd
      xchg     ebx,eax
      lodsd
      xchg     edx,eax
      lodsd
      push     esi
      imul     ebx,12
      imul     edx,12
      imul     eax,12
      mov      esi,.points_r_ptr
      movups   xmm0,[eax+esi]
      movups   xmm1,[ebx+esi]
      movups   xmm2,[edx+esi]
      minps    xmm0,xmm1
      minps    xmm0,xmm2
      mulps    xmm0,xmm3
      addps    xmm0,xmm4
      cvtss2si eax,xmm0             ; integer key from the 'x' lane
      stosd
      mov      eax,ecx
      stosd
      pop      esi
      inc      ecx
      cmp      ecx,.trian_count_var
      jne      @b

      mov      edi,.minx_ptr
      mov      esi,.minx_ptr2
      mov      ecx,.trian_count_var
      call     sort_hybrid
      ; sort to allow early reject from chck loop
      mov      esi,.minx_ptr2
      mov      edi,.srt_ptr
      mov      ecx,.trian_count_var
      cld
     @@:
      lodsd                         ; skip the key
      lodsd                         ; eax = triangle number
      imul     eax,12
      add      eax,.triangles_ptr
      xchg     esi,eax
      movsd
      movsd
      movsd
      xchg     esi,eax
      loop     @b
      ; str_ptr = adress to sorted triangles list
      ; minimal X coord is taken as key sort order

     ; **************************
     ; speed up mem structs above
     ; **************************

     mfree   .minx_ptr
     mfree   .minx_ptr2

     pop     eax                    ; = [thread_params]
     mov     edi,eax
     mov     ecx,.points_count_var
     lea     ecx,[ecx*3]
     mov     eax,-1
     cld
     rep     stosd                  ; -1 marks "vertex not modified"

     mov     eax,mark_colided_edges_th
     xor     edx,edx
     mov     ecx,4  ;'max'            ; use 4 cpu threads
     call    call_thread

     cmp     [thread_params+4],byte 3 ; tess ?
     jne     .noTess

     ;   thread_params+20  ++
     ;   granularity 16 bytes
     ;   record layout as stored by mark_colided_edges_th:
     ;
     ;  .new_tris_list_ptr          + 0
     ;  .curr_new_tris_index        + 4
     ;  .curr_new_vert_index        + 8
     ;  .new_vert_list_ptr          + 12

     mov     esi,thread_params+20
     pxor    xmm0,xmm0
     mov     ecx,4
    @@:
     movups  xmm1,[esi]
     paddd   xmm0,xmm1
     add     esi,16
     loop    @b

     shufps  xmm0,xmm0,11111001b    ; lanes -> (tris sum, verts sum, ..)
     movlps  .tri_sum,xmm0 ; t and v sums
     movd    eax,xmm0      ; sum of tris
     or      eax,eax
     jz      .tess_none    ; nothing produced: only release the buffers
     add     eax,[triangles_count_var]

     add     eax,100
     imul    eax,12
     malloc  eax
     mov     .t_ptr,eax

     mov     edi,eax
     mov     esi,[thread_params+12]
     ; triangles list != [triangles_ptr] list
     mov     edx,[triangles_count_var]
     xor     ecx,ecx
   .filter:
     mov     ebx,ecx
     mov     eax,ecx
     cld
     shr     ebx,3
     and     eax,111b
     add     ebx,[thread_params+16]
     bt      [ebx],eax    ; check mask
     jc      @f           ; if tri should be removed
     movsd
     movsd
     movsd
     jmp     .f4
   @@:
     add     esi,12
   .f4:
     inc     ecx
     cmp     ecx,edx
     jnz     .filter

     mov     eax,[thread_params+16]  ; free mem mask
     mfree   eax

     mov     ecx,4                    ;   \
     lea     esi,[thread_params+20]   ;
     mov     edx,[points_count_var]   ;
   .llfix:                            ;
     push    ecx                      ;      |
     push    esi                      ;
     mov     ecx,[esi+4]   ; t cnt    ;      |
     mov     esi,[esi]     ; t ptr    ;
     or      ecx,ecx                  ;      |
     jz      .nxa                     ;
     lea     ecx,[ecx*3]              ;
   .lfix:                             ;
     lodsd                            ;
     btr     eax,31                   ;      >  fix t lists
     jnc     @f                       ;
     add     eax,edx                  ;
   @@:                                ;      |
     stosd                            ;
     loop    .lfix                    ;      |
   .nxa:                              ;
     pop     esi                      ;      |
     add     edx,[esi+8]  ; v cnt     ;
     add     esi,16                   ;
     pop     ecx                      ;    /
     loop    .llfix                   ;  /
     ; t list is renewed

     mov     ecx,.vert_sum
     add     ecx,[points_count_var]
     add     ecx,100
     imul    ecx,12
     malloc  ecx
     mov    .v_ptr,eax
     mov     edi,eax
     mov     ecx,[points_count_var]
     mov     esi,[points_r_ptr]
     lea     ecx,[ecx*3]
     cld
     rep     movsd                  ; old vertices first

     mov     ecx,4
     mov     esi,thread_params+20
   .v_lis:
     push    esi ecx
     mov     ecx,[esi+8]  ; v cnt
     mov     esi,[esi+12] ; v ptr
     lea     ecx,[ecx*3]
     rep     movsd                  ; then each thread's new vertices
     pop     ecx esi
     add     esi,16
     loop    .v_lis
     ; v list is renewed

     mov     ecx,4
     mov     esi,thread_params+20
   .fre:
     push    esi ecx
     mov     eax,[esi]
     mov     ebx,[esi+12]
     push    eax
     mfree   ebx
     pop     eax
     mfree   eax
     pop     ecx esi
     add     esi,16
     loop    .fre

     mfree   [points_r_ptr]
     mov     eax,.v_ptr
     mov     [points_r_ptr],eax
     mfree   [triangles_ptr]
     mov     ebx,.t_ptr
     mov     [triangles_ptr],ebx
     mov     ecx,.tri_sum
     mov     edx,.vert_sum
     ; NOTE(review): triangles skipped by .filter are not subtracted
     ;               from the new count - confirm this is intended.
     add     [triangles_count_var],ecx
     add     [points_count_var],edx
     jmp     .skip_cnt

   .tess_none:
     ; Tess mode without any new triangle. The removal mask and the two
     ; lists every thread allocated were leaked on this path before.
     mov     eax,[thread_params+16]  ; free mem mask
     mfree   eax
     mov     ecx,4
     mov     esi,thread_params+20
   .fre_none:
     push    esi ecx
     mov     eax,[esi]               ; thread's new tris list
     mov     ebx,[esi+12]            ; thread's new verts list
     push    eax
     mfree   ebx
     pop     eax
     mfree   eax
     pop     ecx esi
     add     esi,16
     loop    .fre_none
     jmp     .skip_cnt

   .noTess:
     ; copy every vertex the threads have modified back to the object
     mov     esi,[thread_params]
     mov     edi,[points_r_ptr]
     cld
     mov     ecx,[points_count_var]
   .update:
     cmp     [esi],dword -1
     je      @f
     movsd
     movsd
     movsd
     loop   .update
     jmp    .en_u
   @@:
     add     esi,12
     add     edi,12
     loop    .update
   .en_u:
     pop     eax           ; mark 0, 1, 2 or 3
     cmp     eax,3
     je      @f
     or      eax,eax
     jnz     .skip_cnt     ; smooth ins edges cause
   @@:
     ; count the bits set in the intersect mask, one byte per pass
     mov     ecx,[edges_count]
     shr     ecx,3
     inc     ecx
     mov     esi,[edges_intersect_ptr]
     xor     ebx,ebx
   .pop_count:
     push    ecx
     xor     ecx,ecx
   @@:
     mov     eax,[esi]
     shr     eax,cl
     and     eax,1b
     add     ebx,eax
     inc     ecx
     cmp     ecx,8
     jne     @b
     inc     esi
     pop     ecx
     loop    .pop_count
     mov     [edges_coll_count],ebx
   .skip_cnt:
     mfree   dword[thread_params]
     mfree   .srt_ptr
     cls     ; macro
     mov     esp,ebp
     pop     ebp
ret
;==================================================================
mark_colided_edges_th:
; Worker started by mark_colided_edges. Each worker takes one quarter of
; the edges list and tests its edges against the sorted triangles list.
;in: - some globals
;      ebx                     = thread number (stored as .th_No)
;      dword[thread_params]    = ptr to modified vertices list
;      dword[thread_params+4]  = 1 -> smooth jagged inside ed,
;                              = 2 -> smooth ed. sum angles around
;                                     base vertex algo,
;                              = 0 -> mark coll edges feature,
;                              = 3 -> adaptive tess
;      dword[thread_params+12] = sorted tris list
;      if "3" adaptive tess then [thread_params+16] = tris mask ptr
;out:
;      [edges_intersect_ptr] = colided edges list, each edge as bit,
;                              each colided edge marked as 1, otherwise 0
;      [inner_vert_ptr]      = inside vertices mask list,
;                              each vert as bit, if set = outside
;                              (NOTE(review): not written in the code below)
;      if adaptive tess
;      thread_params+20 ++ 16 bytes granularity related to each th_No
;      (layout as really stored at .end):
;       .new_tris_list_ptr    + 0
;       .curr_new_tris_index  + 4
;       .curr_new_vert_index  + 8
;       .new_vert_list_ptr    + 12

   push    ebp
   mov     ebp,esp
   sub     esp,332
   and     ebp,-16       ; aligned frame for movaps
   sub     ebp,128       ; shorter displacements for the locals

   .th_No                  equ dword [ebp-4]
   .tri_cnt                equ dword [ebp-8]
   .curr_new_vert_list_ptr equ dword [ebp-12]
   .EdLen                  equ dword [ebp-16]
   .edStart                equ       [ebp-20]
   .edEnd                  equ       [ebp-24]
   .thread_params4         equ byte  [ebp-25]
   .new_list_mark          equ byte  [ebp-26]

   .intersect_tri equ          dword [ebp-32]
   ; triangles coords (vertices)
   .p2t                    equ       [ebp-48]
   .p3t                    equ       [ebp-64]
   .vertEd1                equ       [ebp-80]   ; \
   .vertEd2                equ       [ebp-96]   ; /  not xchg order
   .indexEd1               equ       [ebp-112]  ; \  broadcasted index
   .indexEd2               equ       [ebp-128]  ; /  not xchg order

   ; the four dwords at [ebp-144] .. [ebp-132] are stored as one
   ; 16-byte record at .end - do not change their order
   .new_vert_list_ptr      equ dword [ebp-132]
   .curr_new_vert_index    equ dword [ebp-136]
   .curr_new_tris_index    equ dword [ebp-140]
   .new_tris_list_ptr      equ       [ebp-144]
   .curr_new_tris_list_ptr equ dword [ebp-148]

   .p1t              equ [ebp]
   .tri_norm         equ [ebp+16]
   .value            equ [ebp+32]
   .tcv3             equ [ebp+48]
   .ang_sum          equ [ebp+52]
   .max_el           equ [ebp+56]
     ; ..
   .normEd           equ [ebp+64]
   .t                equ [ebp+80]
   .vx               equ [ebp+84]
   .vy               equ [ebp+88]

   ; snapshot of 8 consecutive globals starting at [points_count_var]
   .points_count_var equ dword[ebp+96]
   .trian_count_var  equ dword[ebp+100]
   .points_r_ptr     equ dword[ebp+104]
   .triangles_ptr    equ dword[ebp+108]
   .epsilon          equ dword[ebp+112]
   .max_ptr          equ dword[ebp+116]
   .edges_ptr        equ dword[ebp+120]
   .edges_count      equ dword[ebp+124]

   cld
   lea      esi,[points_count_var]
   lea      edi,.points_count_var
   cld
   mov      ecx,8
   rep      movsd
   mov      .th_No,ebx
   mov      eax,[thread_params+12]    ;  sorted tris
   mov      .triangles_ptr,eax
   ; max 'x' coord  ed
   mov      eax,[thread_params+4]
   mov      .thread_params4,al
   cmp      al,3   ; tess ?
   jne      @f
   mov      .max_el,dword 1490 ; maximum elements
   malloc   12*1500  ; 1.5 thousand new verts max
   mov      .curr_new_vert_list_ptr,eax
   mov      .new_vert_list_ptr,eax
   malloc   12*1500  ; 1.5 thousand new tris max
   mov      .curr_new_tris_list_ptr,eax
   mov      .new_tris_list_ptr,eax
   xor      eax,eax
   mov      .curr_new_vert_index,eax
   mov      .curr_new_tris_index,eax
  @@:
   mov      .epsilon,0.001
   mov      ecx,.trian_count_var
   mov      .intersect_tri,intersect_tri
   lea      ecx,[ecx*3]
   mov      .tcv3,ecx
   mov      edx,- 0.008889
   movd     xmm0,edx
   shufps   xmm0,xmm0,0
   movaps   .value,xmm0    ; margin value
if 0
   ; calc reversed teslate area
   mov      esi,matrix_scaled            ; check if in tes area
   lea      edi,[.mxx]                   ; only using indexes
   call     reverse_mx_3x3               ; not reverse tes area
   cvtdq2ps xmm3,[xxadd]
   movlhps  xmm3,xmm3
   movaps   xmm0,[tri_area_x1]
   subps    xmm0,xmm3
   movlps   [.ar_rev1],xmm0
   movhps   [.ar_rev1+12],xmm0
   xor      eax,eax
   mov      [.ar_rev1+8],eax    ; zero 'z'
   mov      [.ar_rev2+8],eax
   lea      esi,[.ar_rev1]
   lea      edi,[.ar_rev2]
   mov      ecx,2
   call     rotary
   movlps   xmm0,[.ar_rev2]
   movhps   xmm0,[.ar_rev2+12]
   ; .ar_rev2  - reversed tes area
end if
   ; split the edges list into 4 ranges, the last thread takes the rest
   xor      edi,edi
   mov      ecx,4 ;[CoresCount]
   mov      eax,.edges_count
   xor      edx,edx
   idiv     ecx
   mov      ebx,eax
   imul     eax,.th_No
   mov      .edStart,eax
   add      eax,ebx
   dec      ecx

   cmp      .th_No,ecx
   jnz      @f
   add      eax,edx
 @@:
   mov      .edEnd,eax
   mov      esi,.edges_ptr
   mov      ebx,.edStart
   shl      ebx,3          ; 8 bytes (two indices) per edge
   add      esi,ebx
   mov      ecx,.edStart
   ; An empty range (edges_count < 4 gives edStart = edEnd for some
   ; threads) must not enter the loop: its exit test is 'jne' after an
   ; increment, so it would run far beyond the edges list.
   cmp      ecx,.edEnd
   jae      .end
 .nx_ed:    ; next edge
   push     ecx
   cld
   xor      eax,eax
   mov      .new_list_mark,al

   ; movlps   xmm0,[esi]
   ; movlps   xmm1,[esi]
   lodsd
   xchg     eax,ebx
   lodsd
   push     esi
   lea      edi,.indexEd2  ; fills .indexEd2 and then .indexEd1 (+16)
   stosd
   stosd
   stosd
   stosd
   xchg     ebx,eax
   stosd
   stosd
   stosd
   stosd
   xchg     eax,ebx
   ; shufps   xmm0,xmm0,0
   ; shufps   xmm1,xmm1,01010101b
   pcmpeqd  xmm7,xmm7
   ; movaps   .indexEd1,xmm0
   ; movaps   .indexEd2,xmm1
   imul     eax,12
   imul     ebx,12
   mov      edi,.points_r_ptr
   psrldq   xmm7,4         ; mask: zero the highest lane
   movups   xmm1,[ebx+edi]
   movups   xmm2,[eax+edi]
   andps    xmm2,xmm7 ;.zer_hgst
   andps    xmm1,xmm7 ;.zer_hgst
   movaps   .vertEd1,xmm2
   movaps   .vertEd2,xmm1
   subps    xmm2,xmm1
   movaps   xmm3,xmm2
   mulps    xmm2,xmm2
   haddps   xmm2,xmm2
   haddps   xmm2,xmm2
   sqrtps   xmm2,xmm2
   movss    .EdLen,xmm2    ; edge length
   rcpps    xmm2,xmm2
   mulps    xmm3,xmm2
   andps    xmm3,xmm7 ;.zer_hgst
   movaps   .normEd,xmm3   ; normalized edge direction
   mov      edi,.triangles_ptr
   xor      eax,eax
   mov      .tri_cnt,eax
 .nx_tri:   ; next triangle
   push     edi
   push     esi
   mov      esi,edi
   cld
   movups   xmm6,[esi]
   movlps   xmm3,.max_el
   lodsd
   xchg     eax,edx
   lodsd
   xchg     eax,ebx
   lodsd
   push     eax
   mov      esi,.points_r_ptr
   cmp      .new_list_mark,1
   jne      @f
   ; scanning this thread's new triangles: vertices come from the new
   ; list; any index greater than .max_el ends the whole thread (.break)
   mov      esi,.new_vert_list_ptr
   shufps   xmm3,xmm3,0
   pcmpgtd  xmm6,xmm3
   movmskps eax,xmm6
   and      eax,111b
   jnz      .break
  @@:
   pop      eax
   imul     eax,12
   imul     ebx,12
   imul     edx,12

   ; if hgst bit set -> old points_r_ptr
   ; this occur only one time per tri so cmov multpile
   ; times  can be used
   movups   xmm6,[eax+esi]   ; .p1t  =  tri 1st vertex
   movups   xmm3,[ebx+esi]   ; .p2t
   movups   xmm4,[edx+esi]   ; .p3t

   movups   .p1t,xmm6
   movups   .p2t,xmm3
   movups   .p3t,xmm4

   movaps   xmm2,xmm6
   maxps    xmm2,xmm3
   maxps    xmm2,xmm4        ; xmm2 = triangle bounding box max
   minps    xmm3,xmm4
   minps    xmm3,xmm6        ; xmm3 = triangle bounding box min

   movaps   xmm4,.vertEd1
   movaps   xmm1,.vertEd2
   maxps    xmm4,xmm1        ; xmm4 = edge bounding box max
   movaps   xmm0,xmm3
   subss    xmm0,.epsilon

   comiss   xmm0,xmm4
   ; min of tri greater than max ed ?
   ; 'X' coord is taken into account
   jna      @f
   ; break
   ; (the list is sorted by minimal 'x', so no later triangle can hit)
   add      esp,8
   jmp      .nx_loop_end
 @@:
   cmpltps  xmm4,xmm3
   minps    xmm1,.vertEd1
   cmpltps  xmm1,xmm2
   xorps    xmm1,xmm4
   movmskps eax,xmm1
   or       eax,eax
   jnz      .chck
 .end_l_tri:
   pop      esi
   pop      edi
   add      edi,12
   inc      .tri_cnt
   mov      edx,.tri_cnt
   cmp      edx,.trian_count_var
   ; NOTE(review): 'jna' also accepts .tri_cnt = count, i.e. one entry
   ;               past the sorted list - looks like an off by one, confirm.
   jna      .nx_tri

   cmp      .thread_params4,3
   jne      .nx_loop_end
   ; tess: continue with the triangles this thread has created so far
   mov      .new_list_mark,1
   sub      edx,.trian_count_var
   cmp      edx,.curr_new_tris_index
   ja       .nx_loop_end

   imul     edx,12
   add      edx,.new_tris_list_ptr
   mov      edi,edx

   jmp      .nx_tri
 .nx_loop_end:
   pop      esi
   pop      ecx
   inc      ecx
   cmp      ecx,.edEnd
   jne      .nx_ed
   jmp      .end
;intersect_tri: procs header
; in:
;     xmm0 - ray direction  ; should be normalized
;     xmm1 - ray orgin
;     xmm2 - tri vert1
;     xmm3 - tri vert2
;     xmm4 - tri vert3
;     if  eax = 1 - intersction with edge
;        xmm6 - edge lenght
;     if  eax = 0 - intersect with ray (classic)
; out:
;     eax  = 1 - intersection occured
;     xmm0 - float lo -> hi = t, v, u, ...
; @@:
 .chck:
   cmp      .new_list_mark,1
   je       @f
   ; skip triangles that own one of the edge's vertices
   movups   xmm0,[edi]
   movups   xmm1,[edi]
   pcmpeqd  xmm0,.indexEd1
   pcmpeqd  xmm1,.indexEd2
   orps     xmm1,xmm0
   cld
   movmskps eax,xmm1
   and      eax,111b
   or       eax,eax
   jnz      .end_l_tri
 @@:
   movups   xmm2,xmm6 ;.p1t
   movups   xmm3,.p2t
   movups   xmm4,.p3t

   movaps   xmm0,.normEd
   movaps   xmm1,.vertEd2
   movss    xmm6,.EdLen
   mov      eax,1
   call     .intersect_tri
   movaps   .t,xmm0
   cmp      eax,1
   jne      .end_l_tri
 .inter:
   ; [edi] - tri
   ; [esi] - edge
   cmp      .thread_params4,3  ;  = 3 -> tess
   je       .tess
   cmp      .thread_params4,0  ;  = 0 -> mark
   jne      .no_mark
 .mrk:
   mov      eax,ecx  ;.tri_cnt ; ecx = edge No
   mov      ebx,ecx
   shr      eax,3
   and      ebx,111b
   add      eax,[edges_intersect_ptr]
   lock     bts [eax],ebx      ; mask is shared by all threads
 .no_mark:
   cmp      .thread_params4,0
   je       .nx_end          ; check smooth inside edges cause

   cmp      .thread_params4,2
   je       .second_method   ; bit other algo for smothing jagged edges
 .tess:
   mov      ebx,.tri_cnt
   imul     ebx,12
   add      ebx,[triangles_normals_ptr]
   movups   xmm5,[ebx]
   movaps   .tri_norm,xmm5
   shufps   xmm0,xmm0,0  ; xmm0 - t
   mulps    xmm0,.normEd
   addps    xmm0,.vertEd2
   ; xmm0 --- intersection point coords
   movaps   xmm7,xmm0
   subps    xmm0,.vertEd1
   mulps    xmm0,.tri_norm
   haddps   xmm0,xmm0
   haddps   xmm0,xmm0   ; dot
   movmskps eax,xmm0    ; sign of the dot selects the vertex to move

   movaps   xmm0,.normEd
   mulps    xmm0,.value ; remains margin value
;   cmp      .thread_params4,3 ; 3 -> tess
;   je       .tess2
   and      eax,1b
   cmp      eax,1b
   jne      @f
   addps    xmm7,xmm0
   mov      ebx,.indexEd1
   jmp      .fix
  @@:
   mov      ebx,.indexEd2
   subps    xmm7,xmm0
  .fix:
   imul     ebx,12
   movhlps  xmm6,xmm7
   cmp      .thread_params4,3 ; 3 -> tess
   je       .tess2
   add      ebx,[thread_params]
   movlps   [ebx],xmm7       ; new x, y of the vertex
   movss    [ebx+8],xmm6     ; new z
   jmp      .nx_end
 .second_method:      ; 2cond algo based on summing angles around
                      ; base vertex. I assume both edge verices
                      ; are not boundary if both sine angles sums
                      ; are above 3.8.
   movaps   xmm6,xmm0 ; t ....
   xor      eax,eax
   mov      .ang_sum,eax
   lea      ebx,.indexEd1
 .calc_ang_sum:
   push     ebx
   mov      ebx,[ebx]
   mov      esi,.triangles_ptr
   cld
   xor      ecx,ecx
   xorps    xmm7,xmm7
 .search:
   lodsd
   cmp      ebx,eax
   je       @f
   inc      ecx
   cmp      ecx,.tcv3 ;eax
   jnz      .search
   jmp      .ang_done
 @@:
   push     esi
   mov      edx,ecx
   mov      eax,ecx
   xor      edx,edx
   push     ebx
   mov      ebx,3
   div      ebx          ; eax = ecx / 3, edx = position inside the tri
   pop      ebx

   push     edx
   ; NOTE(review): eax already is ecx / 3 here; the extra shift
   ;               divides it by 4 again - confirm this is intended.
   shr      eax,2
   ; edx = tri index
   mov      esi,eax             ; i tried sum all angles around vert with
   imul     esi,12              ; .indexEd1 and .indexEd2
   add      esi,.triangles_ptr
   mov      edi,.points_r_ptr
   lodsd
   imul     eax,12
   movups   xmm0,[eax+edi]
   lodsd
   imul     eax,12
   movups   xmm1,[eax+edi]
   lodsd
   imul     eax,12
   movups   xmm2,[eax+edi]
   movaps   xmm3,xmm0
   subps    xmm0,xmm1
   subps    xmm1,xmm2
   subps    xmm2,xmm3
   movaps   xmm3,xmm0
   movaps   xmm4,xmm1
   movaps   xmm5,xmm2
   dpps     xmm0,xmm0,01110111b
   dpps     xmm1,xmm1,01110111b
   dpps     xmm2,xmm2,01110111b
   rsqrtps  xmm0,xmm0

   rsqrtps  xmm1,xmm1
   rsqrtps  xmm2,xmm2            ;         1
   mulps    xmm0,xmm3            ;    xm0/  \
   mulps    xmm1,xmm4            ;     /     \
   mulps    xmm2,xmm5            ;   0 \      \
   pop      edx                  ;       \     \ xm1
;   and      edx,111b            ;         \    \
   cmp      edx,1                ;           \   \
   ; angle "1"                   ;       xm2   \  \     xm3 = pi - xm0 - xm1
   jne      @f                   ;               \ \
   dpps     xmm0,xmm1,01110111b  ;                 \
   movaps   xmm5,xmm0            ;                   2
   jmp      .ang
  @@:
   cmp      edx,2
   jne      @f
   ; angle "2"
   dpps     xmm2,xmm1,01110111b
   movaps   xmm5,xmm2
   jmp      .ang
  @@:
   ; angle "0"
   dpps     xmm0,xmm2,01110111b
   movaps   xmm5,xmm0
 .ang:
;   xorps    xmm1,xmm1
   movaps   xmm2,[the_one]
;   comiss   xmm5,xmm1        ; is cosinus negative ?
;   ja       @f
;   xorps    xmm5,[sign_mask] ; .. so negate,
;   addps    xmm5,xmm2        ; add one,
;   movaps   xmm2,xmm5        ; and add to angles sum instead sinus..
;   jmp      .adan
  @@:
   ; I calc sinus of angles ..
   mulps    xmm5,xmm5
   subps    xmm2,xmm5
   sqrtps   xmm2,xmm2  ; sin
 .adan:
   ; ... and add to xm7
   addps    xmm7,xmm2
   pop      esi
   inc      ecx
   cmp      ecx,.tcv3 ;eax
   jnz      .search
 .ang_done:
   pop      ebx
   mov      eax,[ebx]
   cmp      eax,.indexEd1
   jne      @f
   movss    .ang_sum,xmm7  ; first pass done, keep the sum for .indexEd1
   sub      ebx,16         ; and repeat for .indexEd2
   jmp      .calc_ang_sum
  @@:
   mov      ebx,3.8
   movlps   xmm5,.ang_sum
   movd     xmm4,ebx
   mov      eax,.indexEd1
   comiss   xmm7,xmm5      ; second (indexEd2) sum is greater?
   ja       .Ed1_outer     ; means vertex with first Ed1 index is outer
   mov      eax,.indexEd2
 .Ed1_outer:
   cmpnltss xmm7,xmm4
   cmpnltss xmm5,xmm4
   andps    xmm5,xmm7
   movmskps ebx,xmm5
   ; I assume if both sums of sinuses are
   ; below 3.8 means both verts are not border.
   ; NOTE(review): the compare above tests "not less than 3.8" -
   ;               the comment and the code seem to disagree, confirm.
   and      ebx,1b
   cmp      ebx,1
   je       .nx_end
   movaps   xmm0,xmm6
   shufps   xmm0,xmm0,0    ; xmm0 - t
   mulps    xmm0,.normEd
   addps    xmm0,.vertEd2  ; vert Ed 2 is ray origin
   ; xmm0 -- intersection point coords
   movaps   xmm7,.normEd
   mulps    xmm7,.value    ; remains margin value
   movaps   xmm5,.vertEd1
   cmp      eax,.indexEd2
   je       @f
   movaps   xmm5,.vertEd2
 @@:
   movaps   xmm6,xmm0
   movaps   xmm4,xmm0
   addps    xmm6,xmm7
   subps    xmm4,xmm7
   movaps   xmm0,xmm6
   movaps   xmm2,xmm4
   subps    xmm6,xmm5
   subps    xmm4,xmm5
   dpps     xmm6,xmm6,01110111b
   dpps     xmm4,xmm4,01110111b
   comiss   xmm6,xmm4 ; compare lenghts^2
   jna      @f
   movaps   xmm0,xmm2 ; choose value appropriate to greater lenght
 @@:
   imul     eax,12
   movhlps  xmm6,xmm0
   add      eax,[thread_params]
   movlps   [eax],xmm0
   movss    [eax+8],xmm6
   jmp      .nx_end
 .tess2:

     mov      eax,1490
     cmp      .curr_new_vert_index,eax   ; too many ?
     ja       .nx_end
     cmp      .curr_new_tris_index,eax
     ja       .nx_end

     mov      eax,.tri_cnt
     mov      ebx,eax                 ; mark tri should be removed
     shr      eax,3                   ;
     and      ebx,111b                ;
     add      eax,[thread_params+16]  ;
     ; The mask is shared by all threads, so the update must be atomic,
     ; same as the 'lock bts' in .mrk.
     lock     bts [eax],ebx
     jc       .nx_end    ; Only one vert per tri to avoid clenched tris.
                         ; Try avoid unnormalised tris too!
     ; .triangles_ptr is not [triangles_ptr] !!
     ; .triangles_ptr is sorted

     ; xmm7 new vert position
     mov      edi,.tri_cnt
     cmp      .new_list_mark,1
     jne      @f
     sub      edi,.trian_count_var
     imul     edi,12
     ; NOTE(review): the offset computed above is discarded by the next
     ;               instruction - maybe .new_tris_list_ptr should be
     ;               added instead; confirm.
     mov      edi,.curr_new_tris_list_ptr
     jmp      .mv
    @@:                                      ;           xm0
     imul     edi,12                         ;         /  |  \
     add      edi,.triangles_ptr             ;        / a |   \
.mv: movlps   xmm0,[edi]                     ;       /   eax c \
     movlps   xmm1,[edi+4]                   ;      /  /     \  \
     movlps   xmm2,[edi+8]                   ;     / /    b    \ \
     mov      ebx,.curr_new_vert_list_ptr    ;    xm1 ---------- xm2
     movups   [ebx],xmm7
     mov      eax,.curr_new_vert_index
     bts      eax,31 ; set hgst bit of new vert index as marker
                     ; of new vert index
     mov      edi,.curr_new_tris_list_ptr

; The task could be done by extending the triangles list instead of
; writing to a new one. The old triangle could be cancelled at the same time.
; Remember that each thread works only on its own piece of the list.
; Remember that the list is sorted.
; Maybe it would be better (always), after an early exit from the loop,
; to look through the new list as well? (for th_par4) = 3.

     cld
     ; tri b to new list
     movlps  [edi],xmm1    ; 1 & 2
     add     edi,8
     stosd

     ; tri c to new list
     movlps  [edi],xmm0
     add     edi,4
     stosd
     movlps  [edi],xmm2
     add     edi,4

     ; tri a to new list
     stosd
     movlps  [edi],xmm0    ; 0 & 1
     add     edi,8

     mov    .curr_new_tris_list_ptr,edi
     inc    .curr_new_vert_index
     add    .curr_new_vert_list_ptr,12
     add    .curr_new_tris_index,3

 .nx_end:
   pop      esi
   pop      edi
   jmp      .nx_loop_end
 .break:
   add      esp,20       ; drop the five dwords pushed inside the loops
 .end:
   cmp      .thread_params4,3  ; tess ?
   jne      .ntes
   ; hand the 16-byte result record over to mark_colided_edges
   movups   xmm0,.new_tris_list_ptr
   mov      eax,.th_No
   shl      eax,4
   add      eax,thread_params+20
   movups   [eax],xmm0
 .ntes:
   add      esp,332
   pop      ebp
ret
;==========================================================================
intersect_tri: ; Moeller-Trumbore method
; Tests a ray, or a finite edge lying on that ray, against one triangle.
; in:
;     xmm0 - ray direction  ; should be normalized
;     xmm1 - ray origin
;     xmm2 - tri vert1
;     xmm3 - tri vert2
;     xmm4 - tri vert3
;     eax  = 0  - intersect with ray (classic)
;     eax <> 0  - intersection with edge
;        xmm6 - edge length (low float)
; out:
;     eax  = 1 - intersection occurred, 0 - no intersection
;     xmm0 - float lo -> hi = t, v, u, ...
;            (read back from the locals, only meaningful when eax = 1)
; clobbers:
;     ebx (zeroed on the accepting path), xmm1, xmm2, xmm3, xmm4, xmm7,
;     flags, and whatever cross_reg changes.
; NOTE(review): cross_reg is defined elsewhere. This code treats xmm0 after
;     the call as the cross product of the xmm0/xmm1 inputs - confirm.
; NOTE(review): [zero_hgst] is presumably a mask clearing the 4th lane so
;     the haddps pairs below sum only x, y, z - confirm.
      push    ebp
      mov     ebp,esp
      and     ebp,-16      ; aligned base: locals are accessed with movaps
      sub     esp,280
      sub     ebp,128      ; make shorter locals addresses

       .dir    equ [ebp-16]
       .origin equ [ebp-32]
       .ta     equ [ebp-48]
       .tb     equ [ebp-64]
       .tc     equ [ebp-80]
       .tvec   equ [ebp-96]
       .pvec   equ [ebp-112]
       .qvec   equ [ebp-128]
       .e1     equ [ebp]
       .ift    equ dword[ebp+16]
       .invdet equ [ebp+20]
       .det    equ [ebp+24]
       .ed_l   equ [ebp+28]
       .u      equ [ebp+40]
       .v      equ [ebp+36]
       .t      equ [ebp+32]
       .e2        equ [ebp+48]
       .zero_hgst equ [ebp+64]
       .eps       equ [ebp+80] ;  \
       .epsminus  equ [ebp+84] ;  |    dont change order
       .epsone    equ [ebp+88] ;  /
     ;  .cross_r   equ dword[ebp+96]

      ; copy the 16 bytes at eps_inter into .eps/.epsminus/.epsone
      movups    xmm7,[eps_inter]
      movaps    .eps,xmm7
      movaps    xmm7,[zero_hgst]
      ; spill the inputs
      movaps   .dir,xmm0
      movaps   .origin,xmm1
      movaps   .ta,xmm2
      movaps   .tb,xmm3
      movaps   .tc,xmm4
      movaps   .zero_hgst,xmm7
      mov      .ift,eax
      movss    .ed_l,xmm6

      ; e1 = vert2 - vert1, e2 = vert3 - vert1 (both masked)
      subps    xmm3,xmm2
      subps    xmm4,xmm2
      andps    xmm3,xmm7
      andps    xmm4,xmm7
      movaps   .e1,xmm3
      movaps   .e2,xmm4

      ; pvec = cross_reg(dir, e2), kept in xmm0
      movaps   xmm0,.dir
      movaps   xmm1,xmm4
      call     cross_reg

      ; det = dot(e1, pvec)
      movaps   xmm1,.e1
      mulps    xmm1,xmm0  ;.pvec
      haddps   xmm1,xmm1
      haddps   xmm1,xmm1
      movss    .det,xmm1
      comiss   xmm1,.eps
      ; comiss reports its result in ZF/PF/CF only and clears SF and OF,
      ; so the former signed "jl" could never be taken. "jb" rejects
      ; det < eps and also unordered (NaN) results.
      jb       .miss

      rcpss    xmm2,.det              ; approximate 1/det
      movss    .invdet,xmm2

      ; tvec = origin - vert1 (masked)
      movaps   xmm2,.origin
      subps    xmm2,.ta
      andps    xmm2,.zero_hgst
      movaps   .tvec,xmm2

      ; u = dot(tvec, pvec) * invdet
      mulps    xmm2,xmm0  ;.pvec
      haddps   xmm2,xmm2
      haddps   xmm2,xmm2
      mulss    xmm2,.invdet
      movaps    xmm1,xmm2
      movss    .u,xmm2
      ; accept only when (u > epsminus) and not (u > epsone):
      ; the two masks then differ and their xor is non-zero
      cmpnless xmm1,.epsone
      cmpnless xmm2,[epsminus]        ; global symbol, not the local copy
      xorps    xmm1,xmm2
      movd     eax,xmm1
      or       eax,eax
      jz       .miss

      ; qvec = cross_reg(tvec, e1), kept in xmm0
      movaps   xmm0,.tvec
      movaps   xmm1,.e1
      call     cross_reg

      ; v = dot(dir, qvec) * invdet
      movaps   xmm1,.dir
      mulps    xmm1,xmm0  ;.qvec
      haddps   xmm1,xmm1
      haddps   xmm1,xmm1
      mulss    xmm1,.invdet
      movss    .v,xmm1
      ; accept only when (v > epsminus) and not (u + v > epsone)
      movaps   xmm2,xmm1
      addss    xmm2,.u
      cmpnless xmm2,.epsone
      cmpnless xmm1,.epsminus
      xorps    xmm1,xmm2
      movd     eax,xmm1
      or       eax,eax
      jz       .miss

      ; t = dot(e2, qvec) * invdet
      movaps   xmm1,.e2
      mulps    xmm1,xmm0  ;.qvec
      haddps   xmm1,xmm1
      haddps   xmm1,xmm1
      mulss    xmm1,.invdet
      movss    .t,xmm1
      comiss   xmm1,.eps
      jb       .miss        ; t < eps or NaN (was a never-taken "jl")

      xor      ebx,ebx
      mov      eax,1
      cmp      .ift,ebx
      je       .end         ; ok intersect occurred, no edge case
      ; edge mode: accept only when (t > eps) and not (t > edge length)
      movlps   xmm0,.t
      cmpnless xmm0,.eps
      cmpnless xmm1,.ed_l   ; xmm1 still holds t
      xorps    xmm0,xmm1
      movd     ebx,xmm0
      or       ebx,ebx
      jnz      .end
    .miss:
      xor      eax,eax
    .end:
      movaps   xmm0,.t      ; xmm0 = t, v, u, (unused lane)
      add      esp,280
      pop      ebp
ret

;=============================================================
;=============================================================
;=============================================================
;=============================================================
;=============================================================
remove_non_tri:
; Drops every 12-byte triangle record (three dwords) in which any two of
; the three dwords are equal, compacting the list in place.
; in:
;     [triangles_ptr]       - start of the triangle list
;     [triangles_count_var] - number of records in the list
; out:
;     [triangles_count_var] - number of records kept
;     Kept records are packed at the start of the list in original order.
; clobbers: xmm7 (set to all ones), flags; DF is left cleared.
;           General registers are preserved by pushad/popad.
  pushad
  pcmpeqd xmm7,xmm7               ; xmm7 = all ones, used as the "dead" marker
  mov     esi,[triangles_ptr]
  mov     ecx,[triangles_count_var]
  jecxz   .cop                    ; empty list: nothing to mark

  ; pass 1: overwrite every degenerate record with -1,-1,-1
 .chck:
  mov     eax,[esi]
  cmp     eax,[esi+4]
  je      .tri_fail
  cmp     eax,[esi+8]
  je      .tri_fail
  mov     eax,[esi+4]
  cmp     eax,[esi+8]
  je      .tri_fail
  add     esi,12
  loop    .chck
  jmp     .cop
 .tri_fail:
  movlps  [esi],xmm7              ; two overlapping 8-byte stores
  movlps  [esi+4],xmm7            ; cover the whole 12-byte record
  add     esi,12
  loop    .chck

  ; pass 2: copy the unmarked records down over the marked ones
 .cop:
  mov     esi,[triangles_ptr]
  mov     edi,[triangles_ptr]
  mov     ecx,[triangles_count_var]
  xor     edx,edx                 ; edx = number of records kept
  jecxz   .en_cp                  ; "loop" with ecx = 0 would wrap to 2^32 passes
  cld                             ; movsd must step forward (was the invalid mnemonic "cls")
  or      eax,-1
 .cp:
  cmp     [esi],eax               ; first dword = -1 -> record was marked in pass 1
  je      .ff
  movsd
  movsd
  movsd
  inc     edx
  loop    .cp
  jmp     .en_cp
 .ff:
  add     esi,12
  loop    .cp
 .en_cp:
  mov     [triangles_count_var],edx
  popad
ret
;============================================================
;============================================================
do_variable_slices_ptr_buff:
; Gives every screen pixel a pointer to its depth history.
; For each pixel one count byte is read. A non-zero count stores the
; current history pointer and advances it by count*4 bytes. A zero count
; stores a null pointer (no depth history, pixel has background color).
; in:  esi - counter buff (one byte per pixel)
;      edi - ptr buff (one dword per pixel)
;      eax - voxels depth history buff
;      ecx - count of all screen pixels
; out: eax, esi, edi advanced past the consumed data, ecx = 0,
;      ebx = 0 and edx overwritten (both untouched when ecx = 0 on entry)
       jecxz  .done
       xor    ebx,ebx                  ; ebx = null pointer for empty pixels
     .next_pixel:
       movzx  edx,byte[esi]            ; edx = slice count of this pixel
       inc    esi
       shl    edx,2                    ; edx = history bytes, ZF set when none
       jnz    .has_history
       mov    [edi],ebx
       jmp    .advance
     .has_history:
       mov    [edi],eax                ; pixel's history starts here
       add    eax,edx                  ; next pixel's history follows it
     .advance:
       add    edi,4
       loop   .next_pixel
     .done:
ret
;================================================
count_transparent_voxels: ; Thanks Agner Fog for optimization
                          ; horizontal byte add on sse2   :D
; Sums all bytes of the slices_cnt buffer (one byte per screen pixel).
; in:
;    edi - ptr to slices_cnt buffer
;          this buff describes how many
;          slices has every rendered pixel
;    eax - size x of screen
;    ebx - size y of scr
; out:
;    ecx - count of all transparent voxels
; clobbers: eax, ebx, edx, edi, xmm0, xmm1, xmm3, flags

     mul      ebx                   ; eax = x*y (high half in edx is ignored)
     mov      ecx,eax
     and      eax,1111b             ; eax = bytes left over after 16-byte blocks
     shr      ecx,4                 ; ecx = number of whole 16-byte blocks
     xor      ebx,ebx               ; ebx = sum of the leftover bytes
     xorps    xmm3,xmm3             ; xmm3 dword 0 = sum of the block bytes
     jecxz    .tail                 ; under 16 pixels: "loop" with ecx = 0 would wrap
   .blocks:
     movups   xmm1,[edi]
     xorps    xmm0,xmm0
     psadbw   xmm1,xmm0             ; two partial sums, one per 8-byte half
     pshufd   xmm0,xmm1,0x0e        ; bring the upper half's sum to dword 0
     paddd    xmm0,xmm1             ; dword 0 = sum of all 16 bytes
     paddd    xmm3,xmm0
     add      edi,16
     loop     .blocks
   .tail:
     mov      ecx,eax
     jecxz    .en
   .bytes:
     movzx    eax,byte[edi]
     inc      edi                   ; step to the next byte (was missing: the
                                    ; same byte used to be added repeatedly)
     add      ebx,eax
     loop     .bytes
   .en:
     movd     ecx,xmm3
     add      ecx,ebx
ret
